{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4c415cae-9ab3-4f34-83d2-2609f63af97a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "import pytz\n",
    "from scapy.all import Ether, CookedLinux, Raw\n",
    "import re\n",
    "import logging\n",
    "import pyarrow\n",
    "import random\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e53a1d02-f035-44d8-a9ab-4ddc468393f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Timezone used for every log timestamp.\n",
    "edt = pytz.timezone('US/Eastern')\n",
    "\n",
    "\n",
    "def write_log(message, log_path='LOG_UNSW.txt'):\n",
    "    \"\"\"Append one timestamped line to the run log.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    message : object\n",
    "        Text to record; it is formatted with ``str()`` so non-string values\n",
    "        (counts, shapes, exceptions) can be logged directly.\n",
    "    log_path : str, optional\n",
    "        File to append to. Defaults to ``LOG_UNSW.txt`` in the working\n",
    "        directory, which is what every existing call relies on.\n",
    "\n",
    "    Each line has the form ``YYYY-MM-DD HH:MM:SS : message`` with the time\n",
    "    taken from ``datetime.now(edt)``.\n",
    "    \"\"\"\n",
    "    current_time = datetime.now(edt).strftime('%Y-%m-%d %H:%M:%S')\n",
    "    # The context manager closes the handle even when write() raises, so a\n",
    "    # failed write cannot leak an open file into a long-running loop.\n",
    "    with open(log_path, 'a') as f:\n",
    "        f.write(f'{current_time} : {message}\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ada6327b-29d5-404c-968d-7a5e12e0c6d7",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Create CSV Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "68edd80d-5058-4483-a671-869898c04af5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the complete UNSW.csv into memory in one go (the preview in the next cell\n",
    "# reports 179,759,346 rows x 16 columns). The chunked loop further down iterates\n",
    "# over the same file and rebinds `main_df` to each chunk.\n",
    "main_df = pd.read_csv('UNSW.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f973b7b3-d0e6-461c-82a7-1f42c72faa01",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>payload</th>\n",
       "      <th>t_delta</th>\n",
       "      <th>stime_flow</th>\n",
       "      <th>attack_cat</th>\n",
       "      <th>label</th>\n",
       "      <th>ltime_max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1421927377</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1500...</td>\n",
       "      <td>0201002cc0a8f1f30000000089d8000000000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1421927377</td>\n",
       "      <td>10.40.85.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045c00040ef1400...</td>\n",
       "      <td>0201002cc0a8f1f300000000ead8000000000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1421927387</td>\n",
       "      <td>10.40.85.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045c00040ef1600...</td>\n",
       "      <td>0201002cc0a8f1f300000000ead8000000000000000000...</td>\n",
       "      <td>6.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1421927387</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1700...</td>\n",
       "      <td>0201002cc0a8f1f30000000089d8000000000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1421927397</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1900...</td>\n",
       "      <td>0201002cc0a8f1f30000000089d8000000000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759341</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc6d5e40...</td>\n",
       "      <td>f8d7033e6b99c39e74a4671e7fe282b3d233e5647a2124...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759342</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dc6d5f40...</td>\n",
       "      <td>6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759343</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc6d5f40...</td>\n",
       "      <td>6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759344</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc6d5840...</td>\n",
       "      <td>ddb752b3de517a85324680f2342a51a774932569685b10...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759345</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450000346c6040...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>179759346 rows × 16 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                stime          srcip  sport       dstip  dsport protocol_m  \\\n",
       "0          1421927377    10.40.182.1      0   224.0.0.5       0       ospf   \n",
       "1          1421927377     10.40.85.1      0   224.0.0.5       0       ospf   \n",
       "2          1421927387     10.40.85.1      0   224.0.0.5       0       ospf   \n",
       "3          1421927387    10.40.182.1      0   224.0.0.5       0       ospf   \n",
       "4          1421927397    10.40.182.1      0   224.0.0.5       0       ospf   \n",
       "...               ...            ...    ...         ...     ...        ...   \n",
       "179759341  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "179759342  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "179759343  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "179759344  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "179759345  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "\n",
       "           sttl  total_len   first_layer  \\\n",
       "0             1         64  cooked linux   \n",
       "1             1         64  cooked linux   \n",
       "2             1         64  cooked linux   \n",
       "3             1         64  cooked linux   \n",
       "4             1         64  cooked linux   \n",
       "...         ...        ...           ...   \n",
       "179759341    30       1500  cooked linux   \n",
       "179759342    29       1500  cooked linux   \n",
       "179759343    30       1500  cooked linux   \n",
       "179759344    30       1500  cooked linux   \n",
       "179759345    29         52  cooked linux   \n",
       "\n",
       "                                                      packet  \\\n",
       "0          000400010006005056a524c20000080045c00040ef1500...   \n",
       "1          000400010006005056a577630000080045c00040ef1400...   \n",
       "2          000400010006005056a577630000080045c00040ef1600...   \n",
       "3          000400010006005056a524c20000080045c00040ef1700...   \n",
       "4          000400010006005056a524c20000080045c00040ef1900...   \n",
       "...                                                      ...   \n",
       "179759341  000000010006001b17059e1c00000800450005dc6d5e40...   \n",
       "179759342  000400010006005056a5776300000800450005dc6d5f40...   \n",
       "179759343  000000010006001b17059e1c00000800450005dc6d5f40...   \n",
       "179759344  000000010006001b17059e1c00000800450005dc6d5840...   \n",
       "179759345  000400010006005056a5776300000800450000346c6040...   \n",
       "\n",
       "                                                     payload  t_delta  \\\n",
       "0          0201002cc0a8f1f30000000089d8000000000000000000...      0.0   \n",
       "1          0201002cc0a8f1f300000000ead8000000000000000000...      0.0   \n",
       "2          0201002cc0a8f1f300000000ead8000000000000000000...      6.0   \n",
       "3          0201002cc0a8f1f30000000089d8000000000000000000...      0.0   \n",
       "4          0201002cc0a8f1f30000000089d8000000000000000000...      0.0   \n",
       "...                                                      ...      ...   \n",
       "179759341  f8d7033e6b99c39e74a4671e7fe282b3d233e5647a2124...      0.0   \n",
       "179759342  6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...      0.0   \n",
       "179759343  6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...      0.0   \n",
       "179759344  ddb752b3de517a85324680f2342a51a774932569685b10...      0.0   \n",
       "179759345                                                NaN      0.0   \n",
       "\n",
       "             stime_flow attack_cat  label     ltime_max  \n",
       "0          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "1          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "2          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "3          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "4          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "...                 ...        ...    ...           ...  \n",
       "179759341  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "179759342  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "179759343  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "179759344  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "179759345  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "\n",
       "[179759346 rows x 16 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the loaded frame; pandas renders only the first and last rows of a\n",
    "# frame this large, together with its overall shape.\n",
    "main_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c457b72-fc3f-479b-a4c1-adea8e24bb59",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: DNS decompression loop detected\n",
      "WARNING: DNS decompression loop detected\n",
      "WARNING: DNS decompression loop detected\n",
      "WARNING: DNS decompression loop detected\n",
      "WARNING: DNS decompression loop detected\n",
      "WARNING: DNS decompression loop detected\n"
     ]
    }
   ],
   "source": [
    "chunksize = 10_000_000\n",
    "log_records = 100000  # write a progress line every this many packets\n",
    "output_dir = './UNSW'\n",
    "# Same field pattern as before, compiled once instead of per dump line.\n",
    "field_pattern = re.compile(r'\\s+([a-z_]+)\\s+=\\s+(.+)')\n",
    "\n",
    "# Create the output folder up front so a missing directory cannot make\n",
    "# to_csv() fail only after a whole chunk has already been parsed.\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# enumerate() keeps the 1-based chunk number in step with the reader on every\n",
    "# path, including the early `continue` for chunks that were already written.\n",
    "for chunk, main_df in enumerate(pd.read_csv('UNSW.csv', chunksize=chunksize), start=1):\n",
    "\n",
    "    write_log(f'<<<<<<<<----- Started Processing Chunk {chunk} ----->>>>>>>>')\n",
    "\n",
    "    output_path = f'{output_dir}/output{chunk}.csv'\n",
    "    if os.path.isfile(output_path):\n",
    "        write_log(f'------------ Skipping DataFrame {chunk} as CSV File already exists ------------')\n",
    "        continue\n",
    "\n",
    "    # Pass 1: dissect each raw packet with scapy and keep its textual dump.\n",
    "    # Columns 8 and 9 are still read by position (`first_layer` and `packet`\n",
    "    # in the preview above); zipping the two columns avoids one scalar .iloc\n",
    "    # lookup per cell.\n",
    "    packet_info = []\n",
    "    layer_and_hex = zip(main_df.iloc[:, 8], main_df.iloc[:, 9])\n",
    "    for count, (packet_type, packet_hex) in enumerate(layer_and_hex, start=1):\n",
    "        if packet_type == 'cooked linux':\n",
    "            details = CookedLinux(bytes.fromhex(packet_hex)).show(dump=True)\n",
    "        elif packet_type == 'Ethernet':\n",
    "            details = Ether(bytes.fromhex(packet_hex)).show(dump=True)\n",
    "        else:\n",
    "            # This branch used to print a message and then dump whatever\n",
    "            # `packet` was left over from the previous row (or raise NameError\n",
    "            # on the first row). Store an empty dump instead: the row keeps\n",
    "            # its position and simply gets no parsed fields.\n",
    "            write_log(f'Error -> First Layer is not valid: {packet_type!r} (row {count} of chunk {chunk})')\n",
    "            details = ''\n",
    "        packet_info.append(details)\n",
    "        if count % log_records == 0:\n",
    "            write_log(f'Packets Parsed: {count}')\n",
    "\n",
    "    write_log(f'------------ All Packets Parsed Successfully for Chunk {chunk} ------------')\n",
    "\n",
    "    # Pass 2: turn each dump into {layer name: {field name: value}}.\n",
    "    packet_details = []\n",
    "    for count, details in enumerate(packet_info, start=1):\n",
    "        fields_values = {}\n",
    "        current_layer = ''\n",
    "        for line in details.split('\\n'):\n",
    "            if line.startswith('###[') and ']' in line:\n",
    "                # Header such as '###[ IP ]###' starts a new layer section.\n",
    "                current_layer = line.split(']')[0].split('[')[1].strip()\n",
    "                fields_values[current_layer] = {}\n",
    "            elif current_layer != '':\n",
    "                for field_name, field_value in field_pattern.findall(line):\n",
    "                    fields_values[current_layer][field_name] = field_value\n",
    "        packet_details.append(fields_values)\n",
    "        if count % log_records == 0:\n",
    "            write_log(f'Packet Fields Parsed: {count}')\n",
    "\n",
    "    # The raw dumps are no longer needed; drop them before building the frame.\n",
    "    del packet_info\n",
    "\n",
    "    write_log(f'------------ All Packet Fields Parsed Successfully for Chunk {chunk} ------------')\n",
    "\n",
    "    # Pass 3: flatten each packet into one row keyed by '<layer> <field>'.\n",
    "    df_list = []\n",
    "    for count, fields_values in enumerate(packet_details, start=1):\n",
    "        row = {\n",
    "            f'{layer} {field}': value\n",
    "            for layer, fields in fields_values.items()\n",
    "            for field, value in fields.items()\n",
    "        }\n",
    "        df_list.append(row)\n",
    "        if count % log_records == 0:\n",
    "            write_log(f'Packets appended to DataFrame: {count}')\n",
    "\n",
    "    df = pd.DataFrame(df_list)\n",
    "\n",
    "    write_log(f'------------ All Packets appended to the DataFrame {chunk} ------------')\n",
    "\n",
    "    # Chunks from read_csv(chunksize=...) keep the file-wide row numbers as\n",
    "    # their index, while `df` is numbered from 0. concat(axis=1) aligns on\n",
    "    # index labels, so from chunk 2 onward the two frames shared no labels and\n",
    "    # the output contained stacked, half-empty rows. Renumber the chunk so the\n",
    "    # frames are joined row by row.\n",
    "    df = pd.concat([main_df.reset_index(drop=True), df], axis=1)\n",
    "    df.to_csv(output_path, index=False)\n",
    "\n",
    "    write_log(f'------------ DataFrame {chunk} saved to CSV File ------------')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "357a9e64-865d-487a-9aca-c3cac4d8d194",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Process CSV Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6b54b2e-e5d2-459e-8954-09ed3b44f3af",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (1,3,5,8,9,10,13,16,18,19,22,27,28,29,30,31,32,37,39,41,42,43,44,45,47,50,55,58,63,64,65,66,67,68,69,70,71,72,74,75,76,77,78,81,82,83,84,85,86,88,90,91,92,93,94,95,96,97,100,101,102,108,111,114,116,117,122,124,127,130,133,136,137,138,139,141,143,144,147,150,151,152,154,156,157,158,159,160,161,162,163,164,166,167,168,171,172,176,177,178,179,180,182,184,185,186,187,189,193,198,203,204,205,206,207,208,211,213,214,215,216,217,218,219,220,221,226,227,228,230,231,234,236,238,239,248,253,254,255,257,259,260,262,263,264,265,266,267,271,272,276,279,280,281,283,284,291,292,293,302,304,305,306,307,310,311,312,314,316,317,318,319,322,325,328,335,336,339,340,341,344,346,347,355,356,359,360,363,364,366,367,368,369,370,382,383,384,385,386,387,390,391,392,393,394,395,396,398,399,400,401,402,407,413,414,415,416,417,418,420,421,424) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (1,3,5,8,9,10,13,16,18,19,22,25,28,29,30,31,32,33,38,40,42,43,44,45,46,48,51,56,59,64,65,66,67,68,69,70,71,72,75,76,77,83,84,85,86,87,88,91,92,93,94,95,97,98,99,100,101,102,103,104,105,106,107,108,110,111,113,116,117,118,119,121,122,123,124,125,126,127,132,137,138,139,140,142,143,146,147,149,150,151,152,154,155,156,158,159,160,162,164,165,166,167,169,170,172,173,174,176,178,180,181,183,184,186,189,191,193,195,198,201,204,207,209,212,215,217,218,219,220,222,225,226,230,233,234,235,238,240,241,242,243,244,245,246,247,248,249,250,251,252,255,256,259,260,263,266,267,268,269,270,271,276,279,280,281,285,287,288,290,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,312,314,317,320,323,326,329,330,331,332,333,336,341,344,346,349,351,353,354,355,356,362,363,366,368,370,371,372,373,376) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "for i in range(13,19):\n",
    "    if not os.path.isfile(f'./UNSW/output{i}.csv'):\n",
    "        continue\n",
    "    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>')\n",
    "    df = pd.read_csv(f'./UNSW/output{i}.csv')\n",
    "    if df.shape[0]>10_000_000 or i==18:\n",
    "        write_log(f'------------ Misaligned CSV File {i} Processing ------------')\n",
    "        rows = df.shape[0]//2\n",
    "        df1 = df.iloc[rows:,:16]\n",
    "        df2 = df.iloc[:rows,16:]\n",
    "        df = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)\n",
    "    df.drop_duplicates(inplace=True)\n",
    "    write_log(f'------------ CSV File {i} Processed. Final shape is {df.shape} ------------')\n",
    "    df.to_csv(f'./UNSW/output{i}.csv', index=False)\n",
    "    write_log(f'------------ CSV File {i} Overwritten and Saved ------------')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2111ca8-7bd5-4874-94f8-d4e0332f4a46",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Sampling Packets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0d95ce37-b153-4bf4-8474-cbc780fbf9c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the whole UNSW.csv again, this time into `df`, for the sampling section.\n",
    "# This rebinds `df`, which the CSV-processing loop above also assigns.\n",
    "df = pd.read_csv('UNSW.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8405cca9-3615-418f-b123-871e7be62be7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>payload</th>\n",
       "      <th>t_delta</th>\n",
       "      <th>stime_flow</th>\n",
       "      <th>attack_cat</th>\n",
       "      <th>label</th>\n",
       "      <th>ltime_max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1421927377</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1500...</td>\n",
       "      <td>0201002cc0a8f1f30000000089d8000000000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1421927377</td>\n",
       "      <td>10.40.85.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045c00040ef1400...</td>\n",
       "      <td>0201002cc0a8f1f300000000ead8000000000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1421927387</td>\n",
       "      <td>10.40.85.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045c00040ef1600...</td>\n",
       "      <td>0201002cc0a8f1f300000000ead8000000000000000000...</td>\n",
       "      <td>6.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1421927387</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1700...</td>\n",
       "      <td>0201002cc0a8f1f30000000089d8000000000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1421927397</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1900...</td>\n",
       "      <td>0201002cc0a8f1f30000000089d8000000000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421927e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759341</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc6d5e40...</td>\n",
       "      <td>f8d7033e6b99c39e74a4671e7fe282b3d233e5647a2124...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759342</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dc6d5f40...</td>\n",
       "      <td>6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759343</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc6d5f40...</td>\n",
       "      <td>6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759344</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc6d5840...</td>\n",
       "      <td>ddb752b3de517a85324680f2342a51a774932569685b10...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179759345</th>\n",
       "      <td>1424262069</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>38606</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450000346c6040...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424262e+09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>179759346 rows × 16 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                stime          srcip  sport       dstip  dsport protocol_m  \\\n",
       "0          1421927377    10.40.182.1      0   224.0.0.5       0       ospf   \n",
       "1          1421927377     10.40.85.1      0   224.0.0.5       0       ospf   \n",
       "2          1421927387     10.40.85.1      0   224.0.0.5       0       ospf   \n",
       "3          1421927387    10.40.182.1      0   224.0.0.5       0       ospf   \n",
       "4          1421927397    10.40.182.1      0   224.0.0.5       0       ospf   \n",
       "...               ...            ...    ...         ...     ...        ...   \n",
       "179759341  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "179759342  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "179759343  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "179759344  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "179759345  1424262069  149.171.126.9     80  59.166.0.1   38606        tcp   \n",
       "\n",
       "           sttl  total_len   first_layer  \\\n",
       "0             1         64  cooked linux   \n",
       "1             1         64  cooked linux   \n",
       "2             1         64  cooked linux   \n",
       "3             1         64  cooked linux   \n",
       "4             1         64  cooked linux   \n",
       "...         ...        ...           ...   \n",
       "179759341    30       1500  cooked linux   \n",
       "179759342    29       1500  cooked linux   \n",
       "179759343    30       1500  cooked linux   \n",
       "179759344    30       1500  cooked linux   \n",
       "179759345    29         52  cooked linux   \n",
       "\n",
       "                                                      packet  \\\n",
       "0          000400010006005056a524c20000080045c00040ef1500...   \n",
       "1          000400010006005056a577630000080045c00040ef1400...   \n",
       "2          000400010006005056a577630000080045c00040ef1600...   \n",
       "3          000400010006005056a524c20000080045c00040ef1700...   \n",
       "4          000400010006005056a524c20000080045c00040ef1900...   \n",
       "...                                                      ...   \n",
       "179759341  000000010006001b17059e1c00000800450005dc6d5e40...   \n",
       "179759342  000400010006005056a5776300000800450005dc6d5f40...   \n",
       "179759343  000000010006001b17059e1c00000800450005dc6d5f40...   \n",
       "179759344  000000010006001b17059e1c00000800450005dc6d5840...   \n",
       "179759345  000400010006005056a5776300000800450000346c6040...   \n",
       "\n",
       "                                                     payload  t_delta  \\\n",
       "0          0201002cc0a8f1f30000000089d8000000000000000000...      0.0   \n",
       "1          0201002cc0a8f1f300000000ead8000000000000000000...      0.0   \n",
       "2          0201002cc0a8f1f300000000ead8000000000000000000...      6.0   \n",
       "3          0201002cc0a8f1f30000000089d8000000000000000000...      0.0   \n",
       "4          0201002cc0a8f1f30000000089d8000000000000000000...      0.0   \n",
       "...                                                      ...      ...   \n",
       "179759341  f8d7033e6b99c39e74a4671e7fe282b3d233e5647a2124...      0.0   \n",
       "179759342  6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...      0.0   \n",
       "179759343  6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...      0.0   \n",
       "179759344  ddb752b3de517a85324680f2342a51a774932569685b10...      0.0   \n",
       "179759345                                                NaN      0.0   \n",
       "\n",
       "             stime_flow attack_cat  label     ltime_max  \n",
       "0          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "1          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "2          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "3          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "4          1.421927e+09     normal    0.0  1.421927e+09  \n",
       "...                 ...        ...    ...           ...  \n",
       "179759341  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "179759342  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "179759343  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "179759344  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "179759345  1.424262e+09     normal    0.0  1.424262e+09  \n",
       "\n",
       "[179759346 rows x 16 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4fdcfa75-1833-4d25-85f6-91bd4145025f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            175822483\n",
       "exploits            2317277\n",
       "dos                  656620\n",
       "fuzzers              421364\n",
       "generic              333069\n",
       "reconnaissance       150365\n",
       "backdoor              16370\n",
       "analysis              14183\n",
       "shellcode             13983\n",
       "worms                 13632\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['attack_cat'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "67abd5fb-f311-48db-b599-d462bc1aa87f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct attack categories, in order of first appearance in df.\n",
    "labels = pd.unique(df['attack_cat']).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6d4437de-8170-409a-8e7d-7a13c99a8789",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['normal',\n",
       " 'reconnaissance',\n",
       " 'exploits',\n",
       " 'dos',\n",
       " 'generic',\n",
       " 'shellcode',\n",
       " 'fuzzers',\n",
       " 'worms',\n",
       " 'backdoor',\n",
       " 'analysis']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "93fe77bb-4e9a-4d1f-aad7-39288068c18f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a class-balanced sample: 5400 'normal' packets plus 600 per attack category,\n",
    "# drawing only from rows that carry a payload.\n",
    "has_payload = df['payload'].notnull()  # computed once instead of once per label\n",
    "sampled_parts = []\n",
    "for label in labels:\n",
    "    n_wanted = 5400 if label == 'normal' else 600\n",
    "    pool = df[(df['attack_cat'] == label) & has_payload]\n",
    "    # Sample with replacement only when the category has fewer payload rows than requested.\n",
    "    # The requested size stays per-label (the old fallback always drew 600, even for 'normal').\n",
    "    sampled_parts.append(pool.sample(n_wanted, replace=len(pool) < n_wanted))\n",
    "# One concat instead of repeated DataFrame.append (deprecated; re-copies the frame on every call).\n",
    "# Concatenating the samples directly also keeps the numeric dtypes of df.\n",
    "final_df = pd.concat(sampled_parts, ignore_index=True) if sampled_parts else pd.DataFrame(columns=df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4e7df4de-16bc-472a-baa1-0d6995c2af11",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>payload</th>\n",
       "      <th>t_delta</th>\n",
       "      <th>stime_flow</th>\n",
       "      <th>attack_cat</th>\n",
       "      <th>label</th>\n",
       "      <th>ltime_max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1424229994</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>37207</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dcd67040...</td>\n",
       "      <td>a9e203e374664b68d3d6ee23d2800acfb211df830ca3ae...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424230e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424230e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1421928284</td>\n",
       "      <td>149.171.126.8</td>\n",
       "      <td>6881</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>56689</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>120</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a57763000008004500007834f240...</td>\n",
       "      <td>13426974546f7272656e742070726f746f636f6c000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421928e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421928e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1424239875</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>64300</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>16137</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>90</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500005ad68a40...</td>\n",
       "      <td>e32100000047cf2a8bebb3412359f087ea49aefa1623d5...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424240e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424240e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1421944419</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>13425</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000059439040...</td>\n",
       "      <td>91d4818000010001000000000f7365727665722d393561...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421944e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421944e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1421933389</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>22</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>53586</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>116</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000074ab7b40...</td>\n",
       "      <td>c3431e5e00280a1a99940ecfd83defc7f1b300182e684c...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>normal</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.421933e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10795</th>\n",
       "      <td>1424239582</td>\n",
       "      <td>149.171.126.11</td>\n",
       "      <td>80</td>\n",
       "      <td>175.45.176.3</td>\n",
       "      <td>31557</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253</td>\n",
       "      <td>425</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450001a990e700...</td>\n",
       "      <td>485454502f312e3120323030204f4b0d0a446174653a20...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424240e+09</td>\n",
       "      <td>analysis</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.424240e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10796</th>\n",
       "      <td>1424249492</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>0</td>\n",
       "      <td>149.171.126.15</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>254</td>\n",
       "      <td>108</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500006cc27200...</td>\n",
       "      <td>020100586ec85ed86ec85ed0bd0700005c7c002116fb15...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424249e+09</td>\n",
       "      <td>analysis</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.424250e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10797</th>\n",
       "      <td>1424249610</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>0</td>\n",
       "      <td>149.171.126.15</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>255</td>\n",
       "      <td>108</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500006cc28d00...</td>\n",
       "      <td>020100586ec85ed86ec85ed011eb0002a9a1ff615726a0...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424250e+09</td>\n",
       "      <td>analysis</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.424250e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10798</th>\n",
       "      <td>1424252908</td>\n",
       "      <td>175.45.176.2</td>\n",
       "      <td>0</td>\n",
       "      <td>149.171.126.16</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>255</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000040e97e00...</td>\n",
       "      <td>0201002c6ec85ed86ec85ed00a59000127000000000000...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424253e+09</td>\n",
       "      <td>analysis</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.424253e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10799</th>\n",
       "      <td>1424232871</td>\n",
       "      <td>149.171.126.16</td>\n",
       "      <td>80</td>\n",
       "      <td>175.45.176.3</td>\n",
       "      <td>47966</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253</td>\n",
       "      <td>40</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000028ea2a00...</td>\n",
       "      <td>000000000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.424233e+09</td>\n",
       "      <td>analysis</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.424233e+09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10800 rows × 16 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            stime           srcip  sport           dstip dsport protocol_m  \\\n",
       "0      1424229994   149.171.126.3   6881      59.166.0.8  37207        tcp   \n",
       "1      1421928284   149.171.126.8   6881      59.166.0.6  56689        tcp   \n",
       "2      1424239875      59.166.0.8  64300   149.171.126.9  16137        tcp   \n",
       "3      1421944419   149.171.126.0     53      59.166.0.5  13425        udp   \n",
       "4      1421933389   149.171.126.6     22      59.166.0.2  53586        tcp   \n",
       "...           ...             ...    ...             ...    ...        ...   \n",
       "10795  1424239582  149.171.126.11     80    175.45.176.3  31557        tcp   \n",
       "10796  1424249492    175.45.176.1      0  149.171.126.15      0       ospf   \n",
       "10797  1424249610    175.45.176.1      0  149.171.126.15      0       ospf   \n",
       "10798  1424252908    175.45.176.2      0  149.171.126.16      0       ospf   \n",
       "10799  1424232871  149.171.126.16     80    175.45.176.3  47966        tcp   \n",
       "\n",
       "      sttl total_len   first_layer  \\\n",
       "0       30      1500  cooked linux   \n",
       "1       29       120  cooked linux   \n",
       "2       32        90  cooked linux   \n",
       "3       29        89  cooked linux   \n",
       "4       29       116  cooked linux   \n",
       "...    ...       ...           ...   \n",
       "10795  253       425  cooked linux   \n",
       "10796  254       108  cooked linux   \n",
       "10797  255       108  cooked linux   \n",
       "10798  255        64  cooked linux   \n",
       "10799  253        40  cooked linux   \n",
       "\n",
       "                                                  packet  \\\n",
       "0      000000010006001b17059e1c00000800450005dcd67040...   \n",
       "1      000400010006005056a57763000008004500007834f240...   \n",
       "2      000000010006021ac5000000000008004500005ad68a40...   \n",
       "3      000400010006005056a577630000080045000059439040...   \n",
       "4      000400010006005056a577630000080045000074ab7b40...   \n",
       "...                                                  ...   \n",
       "10795  000000010006001b17059e1c00000800450001a990e700...   \n",
       "10796  000400010006005056a524c2000008004500006cc27200...   \n",
       "10797  000000010006021ac5000000000008004500006cc28d00...   \n",
       "10798  000000010006021ac50000000000080045000040e97e00...   \n",
       "10799  000000010006001b17059e1c0000080045000028ea2a00...   \n",
       "\n",
       "                                                 payload  t_delta  \\\n",
       "0      a9e203e374664b68d3d6ee23d2800acfb211df830ca3ae...      0.0   \n",
       "1      13426974546f7272656e742070726f746f636f6c000000...      0.0   \n",
       "2      e32100000047cf2a8bebb3412359f087ea49aefa1623d5...      0.0   \n",
       "3      91d4818000010001000000000f7365727665722d393561...      0.0   \n",
       "4      c3431e5e00280a1a99940ecfd83defc7f1b300182e684c...      0.0   \n",
       "...                                                  ...      ...   \n",
       "10795  485454502f312e3120323030204f4b0d0a446174653a20...      0.0   \n",
       "10796  020100586ec85ed86ec85ed0bd0700005c7c002116fb15...      0.0   \n",
       "10797  020100586ec85ed86ec85ed011eb0002a9a1ff615726a0...      0.0   \n",
       "10798  0201002c6ec85ed86ec85ed00a59000127000000000000...      0.0   \n",
       "10799                                       000000000000      0.0   \n",
       "\n",
       "         stime_flow attack_cat  label     ltime_max  \n",
       "0      1.424230e+09     normal    0.0  1.424230e+09  \n",
       "1      1.421928e+09     normal    0.0  1.421928e+09  \n",
       "2      1.424240e+09     normal    0.0  1.424240e+09  \n",
       "3      1.421944e+09     normal    0.0  1.421944e+09  \n",
       "4      1.421933e+09     normal    0.0  1.421933e+09  \n",
       "...             ...        ...    ...           ...  \n",
       "10795  1.424240e+09   analysis    1.0  1.424240e+09  \n",
       "10796  1.424249e+09   analysis    1.0  1.424250e+09  \n",
       "10797  1.424250e+09   analysis    1.0  1.424250e+09  \n",
       "10798  1.424253e+09   analysis    1.0  1.424253e+09  \n",
       "10799  1.424233e+09   analysis    1.0  1.424233e+09  \n",
       "\n",
       "[10800 rows x 16 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a72b96a4-f313-4e5c-8c91-cccd8bcecf7e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            5400\n",
       "reconnaissance     600\n",
       "exploits           600\n",
       "dos                600\n",
       "generic            600\n",
       "shellcode          600\n",
       "fuzzers            600\n",
       "worms              600\n",
       "backdoor           600\n",
       "analysis           600\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df['attack_cat'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "72f963dd-0d65-4f78-9c8a-5d10840ea10b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode every sampled packet with scapy and flatten its header fields into df1:\n",
    "# one row per packet (same order as final_df), one column per '<layer> <field>' pair.\n",
    "FIELD_RE = re.compile(r'\\s+([a-z_]+)\\s+=\\s+(.+)')\n",
    "LAYER_CLASSES = {'cooked linux': CookedLinux, 'Ethernet': Ether}\n",
    "\n",
    "# Pass 1: hex string -> scapy packet -> text dump produced by show(dump=True).\n",
    "packet_info = []\n",
    "for first_layer, packet_hex in zip(final_df['first_layer'], final_df['packet']):\n",
    "    layer_cls = LAYER_CLASSES.get(first_layer)\n",
    "    if layer_cls is None:\n",
    "        # Unknown link layer. Previously execution fell through to packet.show(), which raised\n",
    "        # NameError on the first row or silently re-used the previous row's packet.\n",
    "        # Record an empty dump instead so the row stays aligned with final_df (all-NaN in df1).\n",
    "        print('Error -> First Layer is not valid')\n",
    "        packet_info.append('')\n",
    "        continue\n",
    "    packet_info.append(layer_cls(bytes.fromhex(packet_hex)).show(dump=True))\n",
    "\n",
    "# Pass 2: parse each dump into {layer name: {field name: value string}}.\n",
    "packet_details = []\n",
    "for dump in packet_info:\n",
    "    fields_values = {}\n",
    "    current_layer = ''\n",
    "    for line in dump.split('\\n'):\n",
    "        if line.startswith('###[') and ']' in line:\n",
    "            # Layer header line, e.g. '###[ IP ]###' -> 'IP'.\n",
    "            current_layer = line.split(']')[0].split('[')[1].strip()\n",
    "            fields_values[current_layer] = {}\n",
    "        elif current_layer != '':\n",
    "            for field_name, field_value in FIELD_RE.findall(line):\n",
    "                fields_values[current_layer][field_name] = field_value\n",
    "    packet_details.append(fields_values)\n",
    "\n",
    "# Pass 3: flatten to one dict per packet and build the frame in a single constructor call\n",
    "# (replaces concatenating one single-row DataFrame per packet).\n",
    "df_list = [\n",
    "    {f'{layer} {field}': value for layer, fields in fields_values.items() for field, value in fields.items()}\n",
    "    for fields_values in packet_details\n",
    "]\n",
    "df1 = pd.DataFrame(df_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "08f2c21b-f9bf-408f-a3fe-4498be83c8fd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cooked linux pkttype</th>\n",
       "      <th>cooked linux lladdrlen</th>\n",
       "      <th>cooked linux src</th>\n",
       "      <th>cooked linux proto</th>\n",
       "      <th>IP version</th>\n",
       "      <th>IP ihl</th>\n",
       "      <th>IP tos</th>\n",
       "      <th>IP len</th>\n",
       "      <th>IP id</th>\n",
       "      <th>IP flags</th>\n",
       "      <th>...</th>\n",
       "      <th>SCTPChunkData type</th>\n",
       "      <th>SCTPChunkData reserved</th>\n",
       "      <th>SCTPChunkData unordered</th>\n",
       "      <th>SCTPChunkData beginning</th>\n",
       "      <th>SCTPChunkData ending</th>\n",
       "      <th>SCTPChunkData len</th>\n",
       "      <th>SCTPChunkData tsn</th>\n",
       "      <th>SCTPChunkData stream_id</th>\n",
       "      <th>SCTPChunkData proto_id</th>\n",
       "      <th>SCTPChunkData data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>unicast</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x00\\x1b\\x17\\x05\\\\x9e\\x1c'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>1500</td>\n",
       "      <td>54896</td>\n",
       "      <td>DF</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sent-by-us</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x00PV\\\\xa5wc'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>120</td>\n",
       "      <td>13554</td>\n",
       "      <td>DF</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>unicast</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x02\\x1a\\\\xc5'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>90</td>\n",
       "      <td>54922</td>\n",
       "      <td>DF</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sent-by-us</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x00PV\\\\xa5wc'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>89</td>\n",
       "      <td>17296</td>\n",
       "      <td>DF</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sent-by-us</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x00PV\\\\xa5wc'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>116</td>\n",
       "      <td>43899</td>\n",
       "      <td>DF</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10795</th>\n",
       "      <td>unicast</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x00\\x1b\\x17\\x05\\\\x9e\\x1c'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>425</td>\n",
       "      <td>37095</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10796</th>\n",
       "      <td>sent-by-us</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x00PV\\\\xa5$\\\\xc2'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>108</td>\n",
       "      <td>49778</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10797</th>\n",
       "      <td>unicast</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x02\\x1a\\\\xc5'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>108</td>\n",
       "      <td>49805</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10798</th>\n",
       "      <td>unicast</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x02\\x1a\\\\xc5'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>64</td>\n",
       "      <td>59774</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10799</th>\n",
       "      <td>unicast</td>\n",
       "      <td>6</td>\n",
       "      <td>'\\x00\\x1b\\x17\\x05\\\\x9e\\x1c'</td>\n",
       "      <td>IPv4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0x0</td>\n",
       "      <td>40</td>\n",
       "      <td>59946</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10800 rows × 194 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      cooked linux pkttype cooked linux lladdrlen  \\\n",
       "0                  unicast                      6   \n",
       "1               sent-by-us                      6   \n",
       "2                  unicast                      6   \n",
       "3               sent-by-us                      6   \n",
       "4               sent-by-us                      6   \n",
       "...                    ...                    ...   \n",
       "10795              unicast                      6   \n",
       "10796           sent-by-us                      6   \n",
       "10797              unicast                      6   \n",
       "10798              unicast                      6   \n",
       "10799              unicast                      6   \n",
       "\n",
       "                  cooked linux src cooked linux proto IP version IP ihl  \\\n",
       "0      '\\x00\\x1b\\x17\\x05\\\\x9e\\x1c'               IPv4          4      5   \n",
       "1                  '\\x00PV\\\\xa5wc'               IPv4          4      5   \n",
       "2                  '\\x02\\x1a\\\\xc5'               IPv4          4      5   \n",
       "3                  '\\x00PV\\\\xa5wc'               IPv4          4      5   \n",
       "4                  '\\x00PV\\\\xa5wc'               IPv4          4      5   \n",
       "...                            ...                ...        ...    ...   \n",
       "10795  '\\x00\\x1b\\x17\\x05\\\\x9e\\x1c'               IPv4          4      5   \n",
       "10796          '\\x00PV\\\\xa5$\\\\xc2'               IPv4          4      5   \n",
       "10797              '\\x02\\x1a\\\\xc5'               IPv4          4      5   \n",
       "10798              '\\x02\\x1a\\\\xc5'               IPv4          4      5   \n",
       "10799  '\\x00\\x1b\\x17\\x05\\\\x9e\\x1c'               IPv4          4      5   \n",
       "\n",
       "      IP tos IP len  IP id IP flags  ... SCTPChunkData type  \\\n",
       "0        0x0   1500  54896       DF  ...                NaN   \n",
       "1        0x0    120  13554       DF  ...                NaN   \n",
       "2        0x0     90  54922       DF  ...                NaN   \n",
       "3        0x0     89  17296       DF  ...                NaN   \n",
       "4        0x0    116  43899       DF  ...                NaN   \n",
       "...      ...    ...    ...      ...  ...                ...   \n",
       "10795    0x0    425  37095      NaN  ...                NaN   \n",
       "10796    0x0    108  49778      NaN  ...                NaN   \n",
       "10797    0x0    108  49805      NaN  ...                NaN   \n",
       "10798    0x0     64  59774      NaN  ...                NaN   \n",
       "10799    0x0     40  59946      NaN  ...                NaN   \n",
       "\n",
       "      SCTPChunkData reserved SCTPChunkData unordered SCTPChunkData beginning  \\\n",
       "0                        NaN                     NaN                     NaN   \n",
       "1                        NaN                     NaN                     NaN   \n",
       "2                        NaN                     NaN                     NaN   \n",
       "3                        NaN                     NaN                     NaN   \n",
       "4                        NaN                     NaN                     NaN   \n",
       "...                      ...                     ...                     ...   \n",
       "10795                    NaN                     NaN                     NaN   \n",
       "10796                    NaN                     NaN                     NaN   \n",
       "10797                    NaN                     NaN                     NaN   \n",
       "10798                    NaN                     NaN                     NaN   \n",
       "10799                    NaN                     NaN                     NaN   \n",
       "\n",
       "      SCTPChunkData ending SCTPChunkData len SCTPChunkData tsn  \\\n",
       "0                      NaN               NaN               NaN   \n",
       "1                      NaN               NaN               NaN   \n",
       "2                      NaN               NaN               NaN   \n",
       "3                      NaN               NaN               NaN   \n",
       "4                      NaN               NaN               NaN   \n",
       "...                    ...               ...               ...   \n",
       "10795                  NaN               NaN               NaN   \n",
       "10796                  NaN               NaN               NaN   \n",
       "10797                  NaN               NaN               NaN   \n",
       "10798                  NaN               NaN               NaN   \n",
       "10799                  NaN               NaN               NaN   \n",
       "\n",
       "      SCTPChunkData stream_id SCTPChunkData proto_id SCTPChunkData data  \n",
       "0                         NaN                    NaN                NaN  \n",
       "1                         NaN                    NaN                NaN  \n",
       "2                         NaN                    NaN                NaN  \n",
       "3                         NaN                    NaN                NaN  \n",
       "4                         NaN                    NaN                NaN  \n",
       "...                       ...                    ...                ...  \n",
       "10795                     NaN                    NaN                NaN  \n",
       "10796                     NaN                    NaN                NaN  \n",
       "10797                     NaN                    NaN                NaN  \n",
       "10798                     NaN                    NaN                NaN  \n",
       "10799                     NaN                    NaN                NaN  \n",
       "\n",
       "[10800 rows x 194 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3c561309-e081-4185-9675-648d49c780df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join the flow-level columns (final_df) with the decoded per-layer packet fields (df1), side by side.\n",
    "# pd.concat(axis=1) aligns rows by index label, not by position: if the two indexes differ,\n",
    "# the result is silently padded with NaN rows instead of raising.\n",
    "df2 = pd.concat([final_df, df1], axis=1)\n",
    "# Guard against that silent misalignment before the frame is inspected and written to disk.\n",
    "assert len(df2) == len(final_df) == len(df1), 'final_df and df1 indexes do not line up'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "187ca49e-2329-4889-8416-e6dea62655d4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>SCTPChunkData type</th>\n",
       "      <th>SCTPChunkData reserved</th>\n",
       "      <th>SCTPChunkData unordered</th>\n",
       "      <th>SCTPChunkData beginning</th>\n",
       "      <th>SCTPChunkData ending</th>\n",
       "      <th>SCTPChunkData len</th>\n",
       "      <th>SCTPChunkData tsn</th>\n",
       "      <th>SCTPChunkData stream_id</th>\n",
       "      <th>SCTPChunkData proto_id</th>\n",
       "      <th>SCTPChunkData data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1424229994</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>37207</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dcd67040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1421928284</td>\n",
       "      <td>149.171.126.8</td>\n",
       "      <td>6881</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>56689</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>120</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a57763000008004500007834f240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1424239875</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>64300</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>16137</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>90</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500005ad68a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1421944419</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>13425</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000059439040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1421933389</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>22</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>53586</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>116</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000074ab7b40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10795</th>\n",
       "      <td>1424239582</td>\n",
       "      <td>149.171.126.11</td>\n",
       "      <td>80</td>\n",
       "      <td>175.45.176.3</td>\n",
       "      <td>31557</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253</td>\n",
       "      <td>425</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450001a990e700...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10796</th>\n",
       "      <td>1424249492</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>0</td>\n",
       "      <td>149.171.126.15</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>254</td>\n",
       "      <td>108</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500006cc27200...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10797</th>\n",
       "      <td>1424249610</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>0</td>\n",
       "      <td>149.171.126.15</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>255</td>\n",
       "      <td>108</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500006cc28d00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10798</th>\n",
       "      <td>1424252908</td>\n",
       "      <td>175.45.176.2</td>\n",
       "      <td>0</td>\n",
       "      <td>149.171.126.16</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>255</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000040e97e00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10799</th>\n",
       "      <td>1424232871</td>\n",
       "      <td>149.171.126.16</td>\n",
       "      <td>80</td>\n",
       "      <td>175.45.176.3</td>\n",
       "      <td>47966</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253</td>\n",
       "      <td>40</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000028ea2a00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10800 rows × 210 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            stime           srcip  sport           dstip dsport protocol_m  \\\n",
       "0      1424229994   149.171.126.3   6881      59.166.0.8  37207        tcp   \n",
       "1      1421928284   149.171.126.8   6881      59.166.0.6  56689        tcp   \n",
       "2      1424239875      59.166.0.8  64300   149.171.126.9  16137        tcp   \n",
       "3      1421944419   149.171.126.0     53      59.166.0.5  13425        udp   \n",
       "4      1421933389   149.171.126.6     22      59.166.0.2  53586        tcp   \n",
       "...           ...             ...    ...             ...    ...        ...   \n",
       "10795  1424239582  149.171.126.11     80    175.45.176.3  31557        tcp   \n",
       "10796  1424249492    175.45.176.1      0  149.171.126.15      0       ospf   \n",
       "10797  1424249610    175.45.176.1      0  149.171.126.15      0       ospf   \n",
       "10798  1424252908    175.45.176.2      0  149.171.126.16      0       ospf   \n",
       "10799  1424232871  149.171.126.16     80    175.45.176.3  47966        tcp   \n",
       "\n",
       "      sttl total_len   first_layer  \\\n",
       "0       30      1500  cooked linux   \n",
       "1       29       120  cooked linux   \n",
       "2       32        90  cooked linux   \n",
       "3       29        89  cooked linux   \n",
       "4       29       116  cooked linux   \n",
       "...    ...       ...           ...   \n",
       "10795  253       425  cooked linux   \n",
       "10796  254       108  cooked linux   \n",
       "10797  255       108  cooked linux   \n",
       "10798  255        64  cooked linux   \n",
       "10799  253        40  cooked linux   \n",
       "\n",
       "                                                  packet  ...  \\\n",
       "0      000000010006001b17059e1c00000800450005dcd67040...  ...   \n",
       "1      000400010006005056a57763000008004500007834f240...  ...   \n",
       "2      000000010006021ac5000000000008004500005ad68a40...  ...   \n",
       "3      000400010006005056a577630000080045000059439040...  ...   \n",
       "4      000400010006005056a577630000080045000074ab7b40...  ...   \n",
       "...                                                  ...  ...   \n",
       "10795  000000010006001b17059e1c00000800450001a990e700...  ...   \n",
       "10796  000400010006005056a524c2000008004500006cc27200...  ...   \n",
       "10797  000000010006021ac5000000000008004500006cc28d00...  ...   \n",
       "10798  000000010006021ac50000000000080045000040e97e00...  ...   \n",
       "10799  000000010006001b17059e1c0000080045000028ea2a00...  ...   \n",
       "\n",
       "      SCTPChunkData type  SCTPChunkData reserved  SCTPChunkData unordered  \\\n",
       "0                    NaN                     NaN                      NaN   \n",
       "1                    NaN                     NaN                      NaN   \n",
       "2                    NaN                     NaN                      NaN   \n",
       "3                    NaN                     NaN                      NaN   \n",
       "4                    NaN                     NaN                      NaN   \n",
       "...                  ...                     ...                      ...   \n",
       "10795                NaN                     NaN                      NaN   \n",
       "10796                NaN                     NaN                      NaN   \n",
       "10797                NaN                     NaN                      NaN   \n",
       "10798                NaN                     NaN                      NaN   \n",
       "10799                NaN                     NaN                      NaN   \n",
       "\n",
       "      SCTPChunkData beginning  SCTPChunkData ending  SCTPChunkData len  \\\n",
       "0                         NaN                   NaN                NaN   \n",
       "1                         NaN                   NaN                NaN   \n",
       "2                         NaN                   NaN                NaN   \n",
       "3                         NaN                   NaN                NaN   \n",
       "4                         NaN                   NaN                NaN   \n",
       "...                       ...                   ...                ...   \n",
       "10795                     NaN                   NaN                NaN   \n",
       "10796                     NaN                   NaN                NaN   \n",
       "10797                     NaN                   NaN                NaN   \n",
       "10798                     NaN                   NaN                NaN   \n",
       "10799                     NaN                   NaN                NaN   \n",
       "\n",
       "      SCTPChunkData tsn SCTPChunkData stream_id SCTPChunkData proto_id  \\\n",
       "0                   NaN                     NaN                    NaN   \n",
       "1                   NaN                     NaN                    NaN   \n",
       "2                   NaN                     NaN                    NaN   \n",
       "3                   NaN                     NaN                    NaN   \n",
       "4                   NaN                     NaN                    NaN   \n",
       "...                 ...                     ...                    ...   \n",
       "10795               NaN                     NaN                    NaN   \n",
       "10796               NaN                     NaN                    NaN   \n",
       "10797               NaN                     NaN                    NaN   \n",
       "10798               NaN                     NaN                    NaN   \n",
       "10799               NaN                     NaN                    NaN   \n",
       "\n",
       "      SCTPChunkData data  \n",
       "0                    NaN  \n",
       "1                    NaN  \n",
       "2                    NaN  \n",
       "3                    NaN  \n",
       "4                    NaN  \n",
       "...                  ...  \n",
       "10795                NaN  \n",
       "10796                NaN  \n",
       "10797                NaN  \n",
       "10798                NaN  \n",
       "10799                NaN  \n",
       "\n",
       "[10800 rows x 210 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the merged frame; pandas elides the middle rows and columns in the rendered output.\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f0d83fe1-8e92-4d23-8c5f-897dd0c1ab04",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "stime                                                            1424227938\n",
       "srcip                                                          175.45.176.2\n",
       "sport                                                                 58075\n",
       "dstip                                                        149.171.126.17\n",
       "dsport                                                                 7002\n",
       "protocol_m                                                              tcp\n",
       "sttl                                                                    255\n",
       "total_len                                                                40\n",
       "first_layer                                                    cooked linux\n",
       "packet                    000000010006021ac50000000000080045000028087b00...\n",
       "payload                                                        3d30303d3030\n",
       "t_delta                                                                 0.0\n",
       "stime_flow                                                     1424227932.0\n",
       "attack_cat                                                          generic\n",
       "label                                                                   1.0\n",
       "ltime_max                                                      1424227940.0\n",
       "cooked linux pkttype                                                unicast\n",
       "cooked linux lladdrlen                                                    6\n",
       "cooked linux src                                            '\\x02\\x1a\\\\xc5'\n",
       "cooked linux proto                                                     IPv4\n",
       "IP version                                                                4\n",
       "IP ihl                                                                    5\n",
       "IP tos                                                                  0x0\n",
       "IP len                                                                   40\n",
       "IP id                                                                  2171\n",
       "IP frag                                                                   0\n",
       "IP ttl                                                                  255\n",
       "IP proto                                                                tcp\n",
       "IP chksum                                                            0x4068\n",
       "IP src                                                         175.45.176.2\n",
       "IP dst                                                       149.171.126.17\n",
       "TCP sport                                                             58075\n",
       "TCP dport                                                     afs3_prserver\n",
       "TCP seq                                                          3888946798\n",
       "TCP ack                                                          2777783915\n",
       "TCP dataofs                                                               5\n",
       "TCP reserved                                                              0\n",
       "TCP flags                                                                 A\n",
       "TCP window                                                            16383\n",
       "TCP chksum                                                           0x347a\n",
       "TCP urgptr                                                                0\n",
       "TCP options                                                              ''\n",
       "Padding load                                                       '=00=00'\n",
       "Name: 7325, dtype: object"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check a single merged record: positional row 7325 with its NaN-valued columns dropped,\n",
    "# so flow-level fields (e.g. srcip, sttl) can be compared with the decoded ones (IP src, IP ttl).\n",
    "# NOTE(review): 7325 looks like an arbitrary pick - confirm, or name it as a constant.\n",
    "df2.iloc[7325].dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c694f9b5-afae-4ba2-8eeb-5df4e99e54c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the merged frame to CSV in the working directory; index=False leaves the row index out of the file.\n",
    "df2.to_csv('UNSW_1000_ANM.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60fcc0d7-c78f-4c05-b744-2a1a94098681",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Sample Records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "50782601-d920-4453-b32d-bca66c049fe8",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m/tmp/ipykernel_4155335/420502805.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      3\u001b[0m         \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m     \u001b[0mwrite_log\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'./UNSW/output{i}.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m     \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'protocol_m'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'tcp'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'protocol_m'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'udp'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'payload'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    209\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    210\u001b[0m                     \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    213\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    329\u001b[0m                     \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_stack_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    330\u001b[0m                 )\n\u001b[0;32m--> 331\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    332\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    333\u001b[0m         \u001b[0;31m# error: \"Callable[[VarArg(Any), KwArg(Any)], Any]\" has no\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m    948\u001b[0m     \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 950\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    951\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    952\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    610\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 611\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    612\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m   1776\u001b[0m                     \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1777\u001b[0m                     \u001b[0mcol_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1778\u001b[0;31m                 \u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m  \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1779\u001b[0m                     \u001b[0mnrows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1780\u001b[0m                 )\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m    228\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    229\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlow_memory\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m                 \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_low_memory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    231\u001b[0m                 \u001b[0;31m# destructive to chunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    232\u001b[0m                 \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_concatenate_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/core/dtypes/common.py\u001b[0m in \u001b[0;36mis_extension_array_dtype\u001b[0;34m(arr_or_dtype)\u001b[0m\n\u001b[1;32m   1431\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1433\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mis_extension_array_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr_or_dtype\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mbool\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1434\u001b[0m     \"\"\"\n\u001b[1;32m   1435\u001b[0m     \u001b[0mCheck\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0man\u001b[0m \u001b[0mobject\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0ma\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0mextension\u001b[0m \u001b[0marray\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "for i in range(0,20):\n",
    "    if not os.path.isfile(f'./UNSW/output{i}.csv'):\n",
    "        continue\n",
    "    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>')\n",
    "    df = pd.read_csv(f'./UNSW/output{i}.csv')\n",
    "    df = df[(df['protocol_m'] == 'tcp') | (df['protocol_m'] == 'udp')]\n",
    "    df = df.dropna(subset=['payload'])\n",
    "    df = df.dropna(axis=1, how='all')\n",
    "    write_log(f'------------ CSV File {i} Processed. Final shape is {df.shape} ------------')\n",
    "    df.to_csv(f'./UNSW/UNSW-1/output{i}.csv', index=False)\n",
    "    write_log(f'------------ CSV File {i} Saved ------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "818e1e0c-1ef5-4a12-a81b-232f5f83eed3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (33,34,39,44,47,52,53,54,55,56,57,59,60,61,62,63,64,74,75,78,79,80,81,82,83,84,85,86,88,90,91,92,93,94,95,96,98,99,100,103,105,106,107,108,109,110,111,112,113,115,116,117,118,123,125,128,131,134,137,140,141,142,143,144,150,157,158,164,168,171,174,175,176,177,179,181,182,187,188,191,193,194,195,197,200,201,203,204,205,206,208,210,211,212,213,215,217,218,222,224,225,230,236,237,238,239,240,243,245,246,247,248,249,250,251,252,253,257,258,261,263,265,266,281,282,283,286,287,288,289,290,291,296,299,300,301,302,303,308,309,310,311,312,315,316,317,318,319,320,321,323,324,326,328,329,330,331,338,339,342,343,344,345,347,350,352,353,361,362,363,364,365,366,368,369,378,383,384,385,386) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('UNSW/output1.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "60aaf93f-d35b-4420-b19b-d804d1964181",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>HSRP reserved</th>\n",
       "      <th>HSRP auth</th>\n",
       "      <th>HSRP MD5 Authentication type</th>\n",
       "      <th>HSRP MD5 Authentication len</th>\n",
       "      <th>HSRP MD5 Authentication algo</th>\n",
       "      <th>HSRP MD5 Authentication padding</th>\n",
       "      <th>HSRP MD5 Authentication flags</th>\n",
       "      <th>HSRP MD5 Authentication sourceip</th>\n",
       "      <th>HSRP MD5 Authentication keyid</th>\n",
       "      <th>SCTPChunkInit addr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1421927377</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1500...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1421927377</td>\n",
       "      <td>10.40.85.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045c00040ef1400...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1421927387</td>\n",
       "      <td>10.40.85.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045c00040ef1600...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1421927387</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1700...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1421927397</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045c00040ef1900...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999995</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>6103</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>52633</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003408ca40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999996</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>6103</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>52633</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003408ca40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999997</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>6103</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>52633</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003408cb40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>668</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500029cc1f040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999999</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>31666</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c200000800450000342d6c40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000000 rows × 387 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime        srcip  sport          dstip  dsport protocol_m  \\\n",
       "0        1421927377  10.40.182.1      0      224.0.0.5       0       ospf   \n",
       "1        1421927377   10.40.85.1      0      224.0.0.5       0       ospf   \n",
       "2        1421927387   10.40.85.1      0      224.0.0.5       0       ospf   \n",
       "3        1421927387  10.40.182.1      0      224.0.0.5       0       ospf   \n",
       "4        1421927397  10.40.182.1      0      224.0.0.5       0       ospf   \n",
       "...             ...          ...    ...            ...     ...        ...   \n",
       "9999995  1421932056   59.166.0.3   6103  149.171.126.4   52633        tcp   \n",
       "9999996  1421932056   59.166.0.3   6103  149.171.126.4   52633        tcp   \n",
       "9999997  1421932056   59.166.0.3   6103  149.171.126.4   52633        tcp   \n",
       "9999998  1421932056   59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "9999999  1421932056   59.166.0.3  31666  149.171.126.5      22        tcp   \n",
       "\n",
       "         sttl  total_len   first_layer  \\\n",
       "0           1         64  cooked linux   \n",
       "1           1         64  cooked linux   \n",
       "2           1         64  cooked linux   \n",
       "3           1         64  cooked linux   \n",
       "4           1         64  cooked linux   \n",
       "...       ...        ...           ...   \n",
       "9999995    31         52  cooked linux   \n",
       "9999996    32         52  cooked linux   \n",
       "9999997    31         52  cooked linux   \n",
       "9999998    32        668  cooked linux   \n",
       "9999999    31         52  cooked linux   \n",
       "\n",
       "                                                    packet  ... HSRP reserved  \\\n",
       "0        000400010006005056a524c20000080045c00040ef1500...  ...           NaN   \n",
       "1        000400010006005056a577630000080045c00040ef1400...  ...           NaN   \n",
       "2        000400010006005056a577630000080045c00040ef1600...  ...           NaN   \n",
       "3        000400010006005056a524c20000080045c00040ef1700...  ...           NaN   \n",
       "4        000400010006005056a524c20000080045c00040ef1900...  ...           NaN   \n",
       "...                                                    ...  ...           ...   \n",
       "9999995  000400010006005056a524c2000008004500003408ca40...  ...           NaN   \n",
       "9999996  000000010006021ac5000000000008004500003408ca40...  ...           NaN   \n",
       "9999997  000400010006005056a524c2000008004500003408cb40...  ...           NaN   \n",
       "9999998  000000010006021ac5000000000008004500029cc1f040...  ...           NaN   \n",
       "9999999  000400010006005056a524c200000800450000342d6c40...  ...           NaN   \n",
       "\n",
       "         HSRP auth  HSRP MD5 Authentication type HSRP MD5 Authentication len  \\\n",
       "0              NaN                           NaN                         NaN   \n",
       "1              NaN                           NaN                         NaN   \n",
       "2              NaN                           NaN                         NaN   \n",
       "3              NaN                           NaN                         NaN   \n",
       "4              NaN                           NaN                         NaN   \n",
       "...            ...                           ...                         ...   \n",
       "9999995        NaN                           NaN                         NaN   \n",
       "9999996        NaN                           NaN                         NaN   \n",
       "9999997        NaN                           NaN                         NaN   \n",
       "9999998        NaN                           NaN                         NaN   \n",
       "9999999        NaN                           NaN                         NaN   \n",
       "\n",
       "         HSRP MD5 Authentication algo  HSRP MD5 Authentication padding  \\\n",
       "0                                 NaN                              NaN   \n",
       "1                                 NaN                              NaN   \n",
       "2                                 NaN                              NaN   \n",
       "3                                 NaN                              NaN   \n",
       "4                                 NaN                              NaN   \n",
       "...                               ...                              ...   \n",
       "9999995                           NaN                              NaN   \n",
       "9999996                           NaN                              NaN   \n",
       "9999997                           NaN                              NaN   \n",
       "9999998                           NaN                              NaN   \n",
       "9999999                           NaN                              NaN   \n",
       "\n",
       "        HSRP MD5 Authentication flags  HSRP MD5 Authentication sourceip  \\\n",
       "0                                 NaN                               NaN   \n",
       "1                                 NaN                               NaN   \n",
       "2                                 NaN                               NaN   \n",
       "3                                 NaN                               NaN   \n",
       "4                                 NaN                               NaN   \n",
       "...                               ...                               ...   \n",
       "9999995                           NaN                               NaN   \n",
       "9999996                           NaN                               NaN   \n",
       "9999997                           NaN                               NaN   \n",
       "9999998                           NaN                               NaN   \n",
       "9999999                           NaN                               NaN   \n",
       "\n",
       "        HSRP MD5 Authentication keyid SCTPChunkInit addr  \n",
       "0                                 NaN                NaN  \n",
       "1                                 NaN                NaN  \n",
       "2                                 NaN                NaN  \n",
       "3                                 NaN                NaN  \n",
       "4                                 NaN                NaN  \n",
       "...                               ...                ...  \n",
       "9999995                           NaN                NaN  \n",
       "9999996                           NaN                NaN  \n",
       "9999997                           NaN                NaN  \n",
       "9999998                           NaN                NaN  \n",
       "9999999                           NaN                NaN  \n",
       "\n",
       "[10000000 rows x 387 columns]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ff8e97c-1145-437a-bf8d-8809bec9d99b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tcp            9829540\n",
       "udp             164413\n",
       "ospf              2970\n",
       "arp               1525\n",
       "others             990\n",
       "sctp                64\n",
       "pim                 42\n",
       "any                 30\n",
       "icmp                20\n",
       "sep                 18\n",
       "ipv6                16\n",
       "sun-nd              14\n",
       "swipe               14\n",
       "mobile              14\n",
       "encap               10\n",
       "crtp                10\n",
       "etherip             10\n",
       "gmtp                10\n",
       "pnni                10\n",
       "snp                 10\n",
       "iplt                10\n",
       "fire                10\n",
       "crudp               10\n",
       "sccopmce            10\n",
       "pipe                10\n",
       "micp                10\n",
       "sps                 10\n",
       "fc                  10\n",
       "ib                  10\n",
       "aes-sp3-d           10\n",
       "rvd                 10\n",
       "ipip                10\n",
       "ax.25               10\n",
       "larp                10\n",
       "dgp                 10\n",
       "vmtp                10\n",
       "secure-vmtp         10\n",
       "gre                 10\n",
       "rsvp                10\n",
       "rdp                 10\n",
       "hmp                 10\n",
       "emcon               10\n",
       "nvp                 10\n",
       "pup                 10\n",
       "egp                 10\n",
       "ip                  10\n",
       "unas                10\n",
       "Name: protocol_m, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['protocol_m'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "6120bd7f-3c2a-4d2a-ad93-ddb004b3f5b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[(df['protocol_m'] == 'tcp') | (df['protocol_m'] == 'udp')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "a6c2a999-8bfb-42d0-a4d6-0332c41992ef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>HSRP reserved</th>\n",
       "      <th>HSRP auth</th>\n",
       "      <th>HSRP MD5 Authentication type</th>\n",
       "      <th>HSRP MD5 Authentication len</th>\n",
       "      <th>HSRP MD5 Authentication algo</th>\n",
       "      <th>HSRP MD5 Authentication padding</th>\n",
       "      <th>HSRP MD5 Authentication flags</th>\n",
       "      <th>HSRP MD5 Authentication sourceip</th>\n",
       "      <th>HSRP MD5 Authentication keyid</th>\n",
       "      <th>SCTPChunkInit addr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>udp</td>\n",
       "      <td>30</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>udp</td>\n",
       "      <td>30</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.16</td>\n",
       "      <td>80</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>13284</td>\n",
       "      <td>tcp</td>\n",
       "      <td>252</td>\n",
       "      <td>48</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000030d80700...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.16</td>\n",
       "      <td>80</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>13284</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253</td>\n",
       "      <td>48</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000030d80700...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999995</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>6103</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>52633</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003408ca40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999996</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>6103</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>52633</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003408ca40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999997</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>6103</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>52633</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003408cb40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>668</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500029cc1f040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999999</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>31666</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>52</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c200000800450000342d6c40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9993953 rows × 387 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime           srcip  sport          dstip  dsport protocol_m  \\\n",
       "8        1421927414   149.171.126.0     53     59.166.0.3   49664        udp   \n",
       "9        1421927414   149.171.126.5     53     59.166.0.5    3593        udp   \n",
       "10       1421927414   149.171.126.5     53     59.166.0.5    3593        udp   \n",
       "11       1421927414  149.171.126.16     80   175.45.176.0   13284        tcp   \n",
       "12       1421927414  149.171.126.16     80   175.45.176.0   13284        tcp   \n",
       "...             ...             ...    ...            ...     ...        ...   \n",
       "9999995  1421932056      59.166.0.3   6103  149.171.126.4   52633        tcp   \n",
       "9999996  1421932056      59.166.0.3   6103  149.171.126.4   52633        tcp   \n",
       "9999997  1421932056      59.166.0.3   6103  149.171.126.4   52633        tcp   \n",
       "9999998  1421932056      59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "9999999  1421932056      59.166.0.3  31666  149.171.126.5      22        tcp   \n",
       "\n",
       "         sttl  total_len   first_layer  \\\n",
       "8          30         89  cooked linux   \n",
       "9          29         82  cooked linux   \n",
       "10         30         82  cooked linux   \n",
       "11        252         48  cooked linux   \n",
       "12        253         48  cooked linux   \n",
       "...       ...        ...           ...   \n",
       "9999995    31         52  cooked linux   \n",
       "9999996    32         52  cooked linux   \n",
       "9999997    31         52  cooked linux   \n",
       "9999998    32        668  cooked linux   \n",
       "9999999    31         52  cooked linux   \n",
       "\n",
       "                                                    packet  ... HSRP reserved  \\\n",
       "8        000000010006001b17059e1c0000080045000059f17a40...  ...           NaN   \n",
       "9        000400010006005056a577630000080045000052456e40...  ...           NaN   \n",
       "10       000000010006001b17059e1c0000080045000052456e40...  ...           NaN   \n",
       "11       000400010006005056a577630000080045000030d80700...  ...           NaN   \n",
       "12       000000010006001b17059e1c0000080045000030d80700...  ...           NaN   \n",
       "...                                                    ...  ...           ...   \n",
       "9999995  000400010006005056a524c2000008004500003408ca40...  ...           NaN   \n",
       "9999996  000000010006021ac5000000000008004500003408ca40...  ...           NaN   \n",
       "9999997  000400010006005056a524c2000008004500003408cb40...  ...           NaN   \n",
       "9999998  000000010006021ac5000000000008004500029cc1f040...  ...           NaN   \n",
       "9999999  000400010006005056a524c200000800450000342d6c40...  ...           NaN   \n",
       "\n",
       "         HSRP auth  HSRP MD5 Authentication type HSRP MD5 Authentication len  \\\n",
       "8              NaN                           NaN                         NaN   \n",
       "9              NaN                           NaN                         NaN   \n",
       "10             NaN                           NaN                         NaN   \n",
       "11             NaN                           NaN                         NaN   \n",
       "12             NaN                           NaN                         NaN   \n",
       "...            ...                           ...                         ...   \n",
       "9999995        NaN                           NaN                         NaN   \n",
       "9999996        NaN                           NaN                         NaN   \n",
       "9999997        NaN                           NaN                         NaN   \n",
       "9999998        NaN                           NaN                         NaN   \n",
       "9999999        NaN                           NaN                         NaN   \n",
       "\n",
       "         HSRP MD5 Authentication algo  HSRP MD5 Authentication padding  \\\n",
       "8                                 NaN                              NaN   \n",
       "9                                 NaN                              NaN   \n",
       "10                                NaN                              NaN   \n",
       "11                                NaN                              NaN   \n",
       "12                                NaN                              NaN   \n",
       "...                               ...                              ...   \n",
       "9999995                           NaN                              NaN   \n",
       "9999996                           NaN                              NaN   \n",
       "9999997                           NaN                              NaN   \n",
       "9999998                           NaN                              NaN   \n",
       "9999999                           NaN                              NaN   \n",
       "\n",
       "        HSRP MD5 Authentication flags  HSRP MD5 Authentication sourceip  \\\n",
       "8                                 NaN                               NaN   \n",
       "9                                 NaN                               NaN   \n",
       "10                                NaN                               NaN   \n",
       "11                                NaN                               NaN   \n",
       "12                                NaN                               NaN   \n",
       "...                               ...                               ...   \n",
       "9999995                           NaN                               NaN   \n",
       "9999996                           NaN                               NaN   \n",
       "9999997                           NaN                               NaN   \n",
       "9999998                           NaN                               NaN   \n",
       "9999999                           NaN                               NaN   \n",
       "\n",
       "        HSRP MD5 Authentication keyid SCTPChunkInit addr  \n",
       "8                                 NaN                NaN  \n",
       "9                                 NaN                NaN  \n",
       "10                                NaN                NaN  \n",
       "11                                NaN                NaN  \n",
       "12                                NaN                NaN  \n",
       "...                               ...                ...  \n",
       "9999995                           NaN                NaN  \n",
       "9999996                           NaN                NaN  \n",
       "9999997                           NaN                NaN  \n",
       "9999998                           NaN                NaN  \n",
       "9999999                           NaN                NaN  \n",
       "\n",
       "[9993953 rows x 387 columns]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "00a84644-94e5-4a92-bff6-24f97b24955d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose 'payload' value is present (non-null).\n",
    "df = df[df['payload'].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "121ddb1e-f3eb-494f-b748-90576c43603c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>HSRP reserved</th>\n",
       "      <th>HSRP auth</th>\n",
       "      <th>HSRP MD5 Authentication type</th>\n",
       "      <th>HSRP MD5 Authentication len</th>\n",
       "      <th>HSRP MD5 Authentication algo</th>\n",
       "      <th>HSRP MD5 Authentication padding</th>\n",
       "      <th>HSRP MD5 Authentication flags</th>\n",
       "      <th>HSRP MD5 Authentication sourceip</th>\n",
       "      <th>HSRP MD5 Authentication keyid</th>\n",
       "      <th>SCTPChunkInit addr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>udp</td>\n",
       "      <td>30</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>udp</td>\n",
       "      <td>30</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>2142</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052f17c40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999974</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>51456</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>69</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000045c6a740...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999986</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>668</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500029cc1f040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999987</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>80</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000050c1f140...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999988</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>80</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000050c1f140...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>668</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500029cc1f040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5606670 rows × 387 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime          srcip  sport          dstip  dsport protocol_m  \\\n",
       "8        1421927414  149.171.126.0     53     59.166.0.3   49664        udp   \n",
       "9        1421927414  149.171.126.5     53     59.166.0.5    3593        udp   \n",
       "10       1421927414  149.171.126.5     53     59.166.0.5    3593        udp   \n",
       "15       1421927414  149.171.126.0     53     59.166.0.3   49664        udp   \n",
       "24       1421927414  149.171.126.4     53     59.166.0.6    2142        udp   \n",
       "...             ...            ...    ...            ...     ...        ...   \n",
       "9999974  1421932056     59.166.0.8  51456  149.171.126.3    6881        tcp   \n",
       "9999986  1421932056     59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "9999987  1421932056     59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "9999988  1421932056     59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "9999998  1421932056     59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "\n",
       "         sttl  total_len   first_layer  \\\n",
       "8          30         89  cooked linux   \n",
       "9          29         82  cooked linux   \n",
       "10         30         82  cooked linux   \n",
       "15         29         89  cooked linux   \n",
       "24         29         82  cooked linux   \n",
       "...       ...        ...           ...   \n",
       "9999974    32         69  cooked linux   \n",
       "9999986    31        668  cooked linux   \n",
       "9999987    31         80  cooked linux   \n",
       "9999988    32         80  cooked linux   \n",
       "9999998    32        668  cooked linux   \n",
       "\n",
       "                                                    packet  ... HSRP reserved  \\\n",
       "8        000000010006001b17059e1c0000080045000059f17a40...  ...           NaN   \n",
       "9        000400010006005056a577630000080045000052456e40...  ...           NaN   \n",
       "10       000000010006001b17059e1c0000080045000052456e40...  ...           NaN   \n",
       "15       000400010006005056a577630000080045000059f17a40...  ...           NaN   \n",
       "24       000400010006005056a577630000080045000052f17c40...  ...           NaN   \n",
       "...                                                    ...  ...           ...   \n",
       "9999974  000000010006021ac50000000000080045000045c6a740...  ...           NaN   \n",
       "9999986  000400010006005056a524c2000008004500029cc1f040...  ...           NaN   \n",
       "9999987  000400010006005056a524c20000080045000050c1f140...  ...           NaN   \n",
       "9999988  000000010006021ac50000000000080045000050c1f140...  ...           NaN   \n",
       "9999998  000000010006021ac5000000000008004500029cc1f040...  ...           NaN   \n",
       "\n",
       "         HSRP auth  HSRP MD5 Authentication type HSRP MD5 Authentication len  \\\n",
       "8              NaN                           NaN                         NaN   \n",
       "9              NaN                           NaN                         NaN   \n",
       "10             NaN                           NaN                         NaN   \n",
       "15             NaN                           NaN                         NaN   \n",
       "24             NaN                           NaN                         NaN   \n",
       "...            ...                           ...                         ...   \n",
       "9999974        NaN                           NaN                         NaN   \n",
       "9999986        NaN                           NaN                         NaN   \n",
       "9999987        NaN                           NaN                         NaN   \n",
       "9999988        NaN                           NaN                         NaN   \n",
       "9999998        NaN                           NaN                         NaN   \n",
       "\n",
       "         HSRP MD5 Authentication algo  HSRP MD5 Authentication padding  \\\n",
       "8                                 NaN                              NaN   \n",
       "9                                 NaN                              NaN   \n",
       "10                                NaN                              NaN   \n",
       "15                                NaN                              NaN   \n",
       "24                                NaN                              NaN   \n",
       "...                               ...                              ...   \n",
       "9999974                           NaN                              NaN   \n",
       "9999986                           NaN                              NaN   \n",
       "9999987                           NaN                              NaN   \n",
       "9999988                           NaN                              NaN   \n",
       "9999998                           NaN                              NaN   \n",
       "\n",
       "        HSRP MD5 Authentication flags  HSRP MD5 Authentication sourceip  \\\n",
       "8                                 NaN                               NaN   \n",
       "9                                 NaN                               NaN   \n",
       "10                                NaN                               NaN   \n",
       "15                                NaN                               NaN   \n",
       "24                                NaN                               NaN   \n",
       "...                               ...                               ...   \n",
       "9999974                           NaN                               NaN   \n",
       "9999986                           NaN                               NaN   \n",
       "9999987                           NaN                               NaN   \n",
       "9999988                           NaN                               NaN   \n",
       "9999998                           NaN                               NaN   \n",
       "\n",
       "        HSRP MD5 Authentication keyid SCTPChunkInit addr  \n",
       "8                                 NaN                NaN  \n",
       "9                                 NaN                NaN  \n",
       "10                                NaN                NaN  \n",
       "15                                NaN                NaN  \n",
       "24                                NaN                NaN  \n",
       "...                               ...                ...  \n",
       "9999974                           NaN                NaN  \n",
       "9999986                           NaN                NaN  \n",
       "9999987                           NaN                NaN  \n",
       "9999988                           NaN                NaN  \n",
       "9999998                           NaN                NaN  \n",
       "\n",
       "[5606670 rows x 387 columns]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "93f42c9c-fb0a-43e7-996b-1be6fcfa6790",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the columns that have at most 1000 missing values.\n",
    "# .copy() detaches the result from the frame it was selected from, so adding\n",
    "# columns to it afterwards does not raise SettingWithCopyWarning.\n",
    "df = df.loc[:, df.isna().sum() <= 1000].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "9f7bbfa2-0ae8-408c-8258-ff6b59979bf3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>IP ihl</th>\n",
       "      <th>IP tos</th>\n",
       "      <th>IP len</th>\n",
       "      <th>IP id</th>\n",
       "      <th>IP frag</th>\n",
       "      <th>IP ttl</th>\n",
       "      <th>IP proto</th>\n",
       "      <th>IP chksum</th>\n",
       "      <th>IP src</th>\n",
       "      <th>IP dst</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>udp</td>\n",
       "      <td>30</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>61818.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>0x1bc5</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>59.166.0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>17774.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>0xc8d1</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>59.166.0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>udp</td>\n",
       "      <td>30</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>17774.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>0xc7d1</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>59.166.0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>61818.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>0x1cc5</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>59.166.0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>2142</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052f17c40...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>61820.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>0x1cc3</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>59.166.0.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999974</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>51456</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>69</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000045c6a740...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>69.0</td>\n",
       "      <td>50855.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0x44af</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>149.171.126.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999986</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>668</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500029cc1f040...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>668.0</td>\n",
       "      <td>49648.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0x4809</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>149.171.126.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999987</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>80</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000050c1f140...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>49649.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0x4a54</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>149.171.126.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999988</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>80</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000050c1f140...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>49649.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0x4954</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>149.171.126.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>668</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500029cc1f040...</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0x0</td>\n",
       "      <td>668.0</td>\n",
       "      <td>49648.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>0x4709</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>149.171.126.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5606670 rows × 31 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime          srcip  sport          dstip  dsport protocol_m  \\\n",
       "8        1421927414  149.171.126.0     53     59.166.0.3   49664        udp   \n",
       "9        1421927414  149.171.126.5     53     59.166.0.5    3593        udp   \n",
       "10       1421927414  149.171.126.5     53     59.166.0.5    3593        udp   \n",
       "15       1421927414  149.171.126.0     53     59.166.0.3   49664        udp   \n",
       "24       1421927414  149.171.126.4     53     59.166.0.6    2142        udp   \n",
       "...             ...            ...    ...            ...     ...        ...   \n",
       "9999974  1421932056     59.166.0.8  51456  149.171.126.3    6881        tcp   \n",
       "9999986  1421932056     59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "9999987  1421932056     59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "9999988  1421932056     59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "9999998  1421932056     59.166.0.8   4793  149.171.126.9      22        tcp   \n",
       "\n",
       "         sttl  total_len   first_layer  \\\n",
       "8          30         89  cooked linux   \n",
       "9          29         82  cooked linux   \n",
       "10         30         82  cooked linux   \n",
       "15         29         89  cooked linux   \n",
       "24         29         82  cooked linux   \n",
       "...       ...        ...           ...   \n",
       "9999974    32         69  cooked linux   \n",
       "9999986    31        668  cooked linux   \n",
       "9999987    31         80  cooked linux   \n",
       "9999988    32         80  cooked linux   \n",
       "9999998    32        668  cooked linux   \n",
       "\n",
       "                                                    packet  ... IP ihl  \\\n",
       "8        000000010006001b17059e1c0000080045000059f17a40...  ...    5.0   \n",
       "9        000400010006005056a577630000080045000052456e40...  ...    5.0   \n",
       "10       000000010006001b17059e1c0000080045000052456e40...  ...    5.0   \n",
       "15       000400010006005056a577630000080045000059f17a40...  ...    5.0   \n",
       "24       000400010006005056a577630000080045000052f17c40...  ...    5.0   \n",
       "...                                                    ...  ...    ...   \n",
       "9999974  000000010006021ac50000000000080045000045c6a740...  ...    5.0   \n",
       "9999986  000400010006005056a524c2000008004500029cc1f040...  ...    5.0   \n",
       "9999987  000400010006005056a524c20000080045000050c1f140...  ...    5.0   \n",
       "9999988  000000010006021ac50000000000080045000050c1f140...  ...    5.0   \n",
       "9999998  000000010006021ac5000000000008004500029cc1f040...  ...    5.0   \n",
       "\n",
       "         IP tos  IP len    IP id  IP frag  IP ttl IP proto  IP chksum  \\\n",
       "8           0x0    89.0  61818.0      0.0    30.0      udp     0x1bc5   \n",
       "9           0x0    82.0  17774.0      0.0    29.0      udp     0xc8d1   \n",
       "10          0x0    82.0  17774.0      0.0    30.0      udp     0xc7d1   \n",
       "15          0x0    89.0  61818.0      0.0    29.0      udp     0x1cc5   \n",
       "24          0x0    82.0  61820.0      0.0    29.0      udp     0x1cc3   \n",
       "...         ...     ...      ...      ...     ...      ...        ...   \n",
       "9999974     0x0    69.0  50855.0      0.0    32.0      tcp     0x44af   \n",
       "9999986     0x0   668.0  49648.0      0.0    31.0      tcp     0x4809   \n",
       "9999987     0x0    80.0  49649.0      0.0    31.0      tcp     0x4a54   \n",
       "9999988     0x0    80.0  49649.0      0.0    32.0      tcp     0x4954   \n",
       "9999998     0x0   668.0  49648.0      0.0    32.0      tcp     0x4709   \n",
       "\n",
       "                IP src         IP dst  \n",
       "8        149.171.126.0     59.166.0.3  \n",
       "9        149.171.126.5     59.166.0.5  \n",
       "10       149.171.126.5     59.166.0.5  \n",
       "15       149.171.126.0     59.166.0.3  \n",
       "24       149.171.126.4     59.166.0.6  \n",
       "...                ...            ...  \n",
       "9999974     59.166.0.8  149.171.126.3  \n",
       "9999986     59.166.0.8  149.171.126.9  \n",
       "9999987     59.166.0.8  149.171.126.9  \n",
       "9999988     59.166.0.8  149.171.126.9  \n",
       "9999998     59.166.0.8  149.171.126.9  \n",
       "\n",
       "[5606670 rows x 31 columns]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "80a88b89-7d03-4ab5-8a64-a4bd43bdb4a0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['stime', 'srcip', 'sport', 'dstip', 'dsport', 'protocol_m', 'sttl',\n",
       "       'total_len', 'first_layer', 'packet', 'payload', 't_delta',\n",
       "       'stime_flow', 'attack_cat', 'label', 'ltime_max',\n",
       "       'cooked linux pkttype', 'cooked linux lladdrlen', 'cooked linux src',\n",
       "       'cooked linux proto', 'IP version', 'IP ihl', 'IP tos', 'IP len',\n",
       "       'IP id', 'IP frag', 'IP ttl', 'IP proto', 'IP chksum', 'IP src',\n",
       "       'IP dst'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed1f13f0-f1c2-4c16-8ba7-5748ff0048c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Length of every payload value, stored in a new payload_len column.\n",
    "# pandas' built-in .str.len() yields NaN for a missing payload, whereas\n",
    "# apply(len) raises TypeError on NaN.\n",
    "# assign() returns a new frame (one extra copy) instead of writing into a\n",
    "# slice of another DataFrame, so no SettingWithCopyWarning is emitted.\n",
    "df = df.assign(payload_len=df['payload'].str.len())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7e9a433-a02a-4140-8e18-d992a04dae05",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Retain only the rows whose payload_len is strictly greater than 100.\n",
    "df = df.loc[df['payload_len'].gt(100)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a089edb2-2837-411c-9b59-a84854e1366c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>HSRP auth</th>\n",
       "      <th>HSRP MD5 Authentication type</th>\n",
       "      <th>HSRP MD5 Authentication len</th>\n",
       "      <th>HSRP MD5 Authentication algo</th>\n",
       "      <th>HSRP MD5 Authentication padding</th>\n",
       "      <th>HSRP MD5 Authentication flags</th>\n",
       "      <th>HSRP MD5 Authentication sourceip</th>\n",
       "      <th>HSRP MD5 Authentication keyid</th>\n",
       "      <th>SCTPChunkInit addr</th>\n",
       "      <th>payload_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>udp</td>\n",
       "      <td>30</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593</td>\n",
       "      <td>udp</td>\n",
       "      <td>30</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>1421927414</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>2142</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>82</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052f17c40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999895</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4006</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>285</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500011dc74a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>466</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999922</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37328</td>\n",
       "      <td>149.171.126.15</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254</td>\n",
       "      <td>293</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000125761700...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>506</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999923</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37328</td>\n",
       "      <td>149.171.126.15</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255</td>\n",
       "      <td>293</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000125761700...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>506</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999986</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31</td>\n",
       "      <td>668</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500029cc1f040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999998</th>\n",
       "      <td>1421932056</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>668</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500029cc1f040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1232</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4370824 rows × 388 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime          srcip  sport           dstip  dsport protocol_m  \\\n",
       "8        1421927414  149.171.126.0     53      59.166.0.3   49664        udp   \n",
       "9        1421927414  149.171.126.5     53      59.166.0.5    3593        udp   \n",
       "10       1421927414  149.171.126.5     53      59.166.0.5    3593        udp   \n",
       "15       1421927414  149.171.126.0     53      59.166.0.3   49664        udp   \n",
       "24       1421927414  149.171.126.4     53      59.166.0.6    2142        udp   \n",
       "...             ...            ...    ...             ...     ...        ...   \n",
       "9999895  1421932056     59.166.0.8   4006   149.171.126.3      80        tcp   \n",
       "9999922  1421932056   175.45.176.1  37328  149.171.126.15      80        tcp   \n",
       "9999923  1421932056   175.45.176.1  37328  149.171.126.15      80        tcp   \n",
       "9999986  1421932056     59.166.0.8   4793   149.171.126.9      22        tcp   \n",
       "9999998  1421932056     59.166.0.8   4793   149.171.126.9      22        tcp   \n",
       "\n",
       "         sttl  total_len   first_layer  \\\n",
       "8          30         89  cooked linux   \n",
       "9          29         82  cooked linux   \n",
       "10         30         82  cooked linux   \n",
       "15         29         89  cooked linux   \n",
       "24         29         82  cooked linux   \n",
       "...       ...        ...           ...   \n",
       "9999895    32        285  cooked linux   \n",
       "9999922   254        293  cooked linux   \n",
       "9999923   255        293  cooked linux   \n",
       "9999986    31        668  cooked linux   \n",
       "9999998    32        668  cooked linux   \n",
       "\n",
       "                                                    packet  ... HSRP auth  \\\n",
       "8        000000010006001b17059e1c0000080045000059f17a40...  ...       NaN   \n",
       "9        000400010006005056a577630000080045000052456e40...  ...       NaN   \n",
       "10       000000010006001b17059e1c0000080045000052456e40...  ...       NaN   \n",
       "15       000400010006005056a577630000080045000059f17a40...  ...       NaN   \n",
       "24       000400010006005056a577630000080045000052f17c40...  ...       NaN   \n",
       "...                                                    ...  ...       ...   \n",
       "9999895  000000010006021ac5000000000008004500011dc74a40...  ...       NaN   \n",
       "9999922  000400010006005056a524c20000080045000125761700...  ...       NaN   \n",
       "9999923  000000010006021ac50000000000080045000125761700...  ...       NaN   \n",
       "9999986  000400010006005056a524c2000008004500029cc1f040...  ...       NaN   \n",
       "9999998  000000010006021ac5000000000008004500029cc1f040...  ...       NaN   \n",
       "\n",
       "         HSRP MD5 Authentication type  HSRP MD5 Authentication len  \\\n",
       "8                                 NaN                          NaN   \n",
       "9                                 NaN                          NaN   \n",
       "10                                NaN                          NaN   \n",
       "15                                NaN                          NaN   \n",
       "24                                NaN                          NaN   \n",
       "...                               ...                          ...   \n",
       "9999895                           NaN                          NaN   \n",
       "9999922                           NaN                          NaN   \n",
       "9999923                           NaN                          NaN   \n",
       "9999986                           NaN                          NaN   \n",
       "9999998                           NaN                          NaN   \n",
       "\n",
       "        HSRP MD5 Authentication algo  HSRP MD5 Authentication padding  \\\n",
       "8                                NaN                              NaN   \n",
       "9                                NaN                              NaN   \n",
       "10                               NaN                              NaN   \n",
       "15                               NaN                              NaN   \n",
       "24                               NaN                              NaN   \n",
       "...                              ...                              ...   \n",
       "9999895                          NaN                              NaN   \n",
       "9999922                          NaN                              NaN   \n",
       "9999923                          NaN                              NaN   \n",
       "9999986                          NaN                              NaN   \n",
       "9999998                          NaN                              NaN   \n",
       "\n",
       "         HSRP MD5 Authentication flags HSRP MD5 Authentication sourceip  \\\n",
       "8                                  NaN                              NaN   \n",
       "9                                  NaN                              NaN   \n",
       "10                                 NaN                              NaN   \n",
       "15                                 NaN                              NaN   \n",
       "24                                 NaN                              NaN   \n",
       "...                                ...                              ...   \n",
       "9999895                            NaN                              NaN   \n",
       "9999922                            NaN                              NaN   \n",
       "9999923                            NaN                              NaN   \n",
       "9999986                            NaN                              NaN   \n",
       "9999998                            NaN                              NaN   \n",
       "\n",
       "         HSRP MD5 Authentication keyid SCTPChunkInit addr payload_len  \n",
       "8                                  NaN                NaN         122  \n",
       "9                                  NaN                NaN         108  \n",
       "10                                 NaN                NaN         108  \n",
       "15                                 NaN                NaN         122  \n",
       "24                                 NaN                NaN         108  \n",
       "...                                ...                ...         ...  \n",
       "9999895                            NaN                NaN         466  \n",
       "9999922                            NaN                NaN         506  \n",
       "9999923                            NaN                NaN         506  \n",
       "9999986                            NaN                NaN        1232  \n",
       "9999998                            NaN                NaN        1232  \n",
       "\n",
       "[4370824 rows x 388 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "f8c49b19-05d7-484c-9d0a-421c1989a674",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            4137696\n",
       "exploits           141748\n",
       "dos                 48767\n",
       "fuzzers             20695\n",
       "generic             18036\n",
       "reconnaissance       2264\n",
       "worms                1248\n",
       "shellcode             246\n",
       "backdoor              124\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['attack_cat'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4eadafd-bf02-49c3-9618-40d1a15ef4ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "count_dict = df['attack_cat'].value_counts().to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "629ad092-a22f-4a07-ab29-6e16dc715447",
   "metadata": {},
   "outputs": [],
   "source": [
    "total_sample_size = 5000\n",
    "sample_size_per_category = total_sample_size // len(count_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "c805bb34-9fce-499e-9d5a-648005ea5d8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_df = pd.DataFrame()\n",
    "\n",
    "for category in proportions.keys():\n",
    "    num_samples = min(len(df[df['attack_cat'] == category]), sample_size_per_category)\n",
    "    samples = df[df['attack_cat'] == category].sample(n=num_samples)\n",
    "    sample_df = pd.concat([sample_df, samples])\n",
    "\n",
    "if len(sample_df) < total_sample_size:\n",
    "    additional_samples = total_sample_size - len(sample_df)\n",
    "    normal_rows = df[df['attack_cat'] == 'normal']\n",
    "    normal_rows = normal_rows.loc[~normal_rows.index.isin(sample_df.index)]\n",
    "    normal_samples = normal_rows.sample(n=min(len(normal_rows), additional_samples))\n",
    "    sample_df = pd.concat([sample_df, normal_samples])\n",
    "\n",
    "sample_df.reset_index(drop=True, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "769250a2-b70e-40e4-b0a9-0d72fb1c3b24",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>HSRP auth</th>\n",
       "      <th>HSRP MD5 Authentication type</th>\n",
       "      <th>HSRP MD5 Authentication len</th>\n",
       "      <th>HSRP MD5 Authentication algo</th>\n",
       "      <th>HSRP MD5 Authentication padding</th>\n",
       "      <th>HSRP MD5 Authentication flags</th>\n",
       "      <th>HSRP MD5 Authentication sourceip</th>\n",
       "      <th>HSRP MD5 Authentication keyid</th>\n",
       "      <th>SCTPChunkInit addr</th>\n",
       "      <th>payload_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1421927938</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>4785</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>29746</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac500000000000800450005dccb4e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1421930041</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>13417</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>34687</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1352</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000548ab6f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1421929999</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>8502</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dccf5a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1421930859</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>14589</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>63488</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1352</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000548bc8f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1421927594</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>80</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>61722</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc2f0a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4995</th>\n",
       "      <td>1421927479</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>6881</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>17586</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dcc88140...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4996</th>\n",
       "      <td>1421931865</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>43379</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc014d40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4997</th>\n",
       "      <td>1421930827</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>143</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>62912</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>204</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450000cc0fb240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4998</th>\n",
       "      <td>1421932036</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>53</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>63717</td>\n",
       "      <td>udp</td>\n",
       "      <td>29</td>\n",
       "      <td>89</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000059f4e740...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4999</th>\n",
       "      <td>1421930215</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>6881</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>40690</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30</td>\n",
       "      <td>1500</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc3cc440...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2896</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5000 rows × 388 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           stime          srcip  sport          dstip  dsport protocol_m  \\\n",
       "0     1421927938     59.166.0.1   4785  149.171.126.3   29746        tcp   \n",
       "1     1421930041  149.171.126.1  13417     59.166.0.7   34687        tcp   \n",
       "2     1421929999  149.171.126.4     80     59.166.0.0    8502        tcp   \n",
       "3     1421930859  149.171.126.1  14589     59.166.0.3   63488        tcp   \n",
       "4     1421927594  149.171.126.1     80     59.166.0.2   61722        tcp   \n",
       "...          ...            ...    ...            ...     ...        ...   \n",
       "4995  1421927479  149.171.126.1   6881     59.166.0.2   17586        tcp   \n",
       "4996  1421931865  149.171.126.5   6881     59.166.0.6   43379        tcp   \n",
       "4997  1421930827  149.171.126.4    143     59.166.0.3   62912        tcp   \n",
       "4998  1421932036  149.171.126.6     53     59.166.0.0   63717        udp   \n",
       "4999  1421930215  149.171.126.5   6881     59.166.0.1   40690        tcp   \n",
       "\n",
       "      sttl  total_len   first_layer  \\\n",
       "0       32       1500  cooked linux   \n",
       "1       30       1352  cooked linux   \n",
       "2       29       1500  cooked linux   \n",
       "3       30       1352  cooked linux   \n",
       "4       30       1500  cooked linux   \n",
       "...    ...        ...           ...   \n",
       "4995    30       1500  cooked linux   \n",
       "4996    30       1500  cooked linux   \n",
       "4997    30        204  cooked linux   \n",
       "4998    29         89  cooked linux   \n",
       "4999    30       1500  cooked linux   \n",
       "\n",
       "                                                 packet  ... HSRP auth  \\\n",
       "0     000000010006021ac500000000000800450005dccb4e40...  ...       NaN   \n",
       "1     000000010006001b17059e1c0000080045000548ab6f40...  ...       NaN   \n",
       "2     000400010006005056a5776300000800450005dccf5a40...  ...       NaN   \n",
       "3     000000010006001b17059e1c0000080045000548bc8f40...  ...       NaN   \n",
       "4     000000010006001b17059e1c00000800450005dc2f0a40...  ...       NaN   \n",
       "...                                                 ...  ...       ...   \n",
       "4995  000000010006001b17059e1c00000800450005dcc88140...  ...       NaN   \n",
       "4996  000000010006001b17059e1c00000800450005dc014d40...  ...       NaN   \n",
       "4997  000000010006001b17059e1c00000800450000cc0fb240...  ...       NaN   \n",
       "4998  000400010006005056a577630000080045000059f4e740...  ...       NaN   \n",
       "4999  000000010006001b17059e1c00000800450005dc3cc440...  ...       NaN   \n",
       "\n",
       "      HSRP MD5 Authentication type  HSRP MD5 Authentication len  \\\n",
       "0                              NaN                          NaN   \n",
       "1                              NaN                          NaN   \n",
       "2                              NaN                          NaN   \n",
       "3                              NaN                          NaN   \n",
       "4                              NaN                          NaN   \n",
       "...                            ...                          ...   \n",
       "4995                           NaN                          NaN   \n",
       "4996                           NaN                          NaN   \n",
       "4997                           NaN                          NaN   \n",
       "4998                           NaN                          NaN   \n",
       "4999                           NaN                          NaN   \n",
       "\n",
       "     HSRP MD5 Authentication algo  HSRP MD5 Authentication padding  \\\n",
       "0                             NaN                              NaN   \n",
       "1                             NaN                              NaN   \n",
       "2                             NaN                              NaN   \n",
       "3                             NaN                              NaN   \n",
       "4                             NaN                              NaN   \n",
       "...                           ...                              ...   \n",
       "4995                          NaN                              NaN   \n",
       "4996                          NaN                              NaN   \n",
       "4997                          NaN                              NaN   \n",
       "4998                          NaN                              NaN   \n",
       "4999                          NaN                              NaN   \n",
       "\n",
       "      HSRP MD5 Authentication flags HSRP MD5 Authentication sourceip  \\\n",
       "0                               NaN                              NaN   \n",
       "1                               NaN                              NaN   \n",
       "2                               NaN                              NaN   \n",
       "3                               NaN                              NaN   \n",
       "4                               NaN                              NaN   \n",
       "...                             ...                              ...   \n",
       "4995                            NaN                              NaN   \n",
       "4996                            NaN                              NaN   \n",
       "4997                            NaN                              NaN   \n",
       "4998                            NaN                              NaN   \n",
       "4999                            NaN                              NaN   \n",
       "\n",
       "      HSRP MD5 Authentication keyid SCTPChunkInit addr payload_len  \n",
       "0                               NaN                NaN        2896  \n",
       "1                               NaN                NaN        2600  \n",
       "2                               NaN                NaN        2896  \n",
       "3                               NaN                NaN        2600  \n",
       "4                               NaN                NaN        2896  \n",
       "...                             ...                ...         ...  \n",
       "4995                            NaN                NaN        2896  \n",
       "4996                            NaN                NaN        2896  \n",
       "4997                            NaN                NaN         304  \n",
       "4998                            NaN                NaN         122  \n",
       "4999                            NaN                NaN        2896  \n",
       "\n",
       "[5000 rows x 388 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c5d182cf-6804-4290-aff9-51dbd30ab33c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            1300\n",
       "exploits           555\n",
       "dos                555\n",
       "fuzzers            555\n",
       "generic            555\n",
       "reconnaissance     555\n",
       "worms              555\n",
       "shellcode          246\n",
       "backdoor           124\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_df['attack_cat'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "93fca778-976e-431d-8597-1cb36f781ccd",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Concatenate Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "164555f5-cd60-4437-a007-b9fa053ff22a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_4155335/230593763.py:6: DtypeWarning: Columns (31,32,44,45,55,66,69,70,71,72,74,75,76,77,79,81,82,85,86,87,93,96,99,101,102,107,109,112,115,118,119,122,123,124,126,128,129,130,131,132,133,134,135,136,138,139,140,143,144,148,149,150,154,155,156,159,161,162,163,164,165,166,167,168,169,174,175,176,178,181,183,185,186,195,200,201,202,204,206,207,209,210,211,212,213,214,218,219,223,226,227,228,230,231,238,239,240,249,251,252,253,254,257,258,260,261,264,267,270,271,283,284,285,286,287,288,291,292,293,294,295,296,297,299,300) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')\n",
      "/tmp/ipykernel_4155335/230593763.py:6: DtypeWarning: Columns (33,44,46,56,67,70,71,72,75,76,77,83,84,85,86,88,89,90,92,93,95,96,97,98,99,100,101,103,104,105,107,109,110,112,113,115,118,120,122,124,127,130,133,136,138,141,144,146,147,148,149,151,154,155,159,160,161,164,166,167,168,169,170,171,172,173,174,175,176,177,178,181,182,185,186,189,192,193,194,195,196,197,202,205,206,207,211,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,235,237,240,243,246,249,252,253,254,255,256,259,264,267,269,271,273,274,275,276,280,283,285,287,288,289,290,293) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')\n",
      "/tmp/ipykernel_4155335/230593763.py:6: DtypeWarning: Columns (33,45,55,66,67,68,70,71,74,75,77,78,81,82,83,84,85,86,87,88,89,90,93,94,95,96,98,101,103,104,106,107,109,110,111,112,115,117,118,119,120,121,122,123,124,126,127,131,132,133,135,137,140,141,142,143,144,149,151,154,157,160,163,166,167,168,174,186,187,188,189,190,191,195,196,197,198,200,201,202,203,212,217,218,219,221,223,224,225,226,227,228,229,231,233,234,238,240,243,244,245,246,247,249,250,252,253,255,258,260,262,264,267,270,273,275,278,281,284,286,287) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')\n",
      "/tmp/ipykernel_4155335/230593763.py:6: DtypeWarning: Columns (33,45,50,55,58,63,64,65,66,67,68,70,71,72,73,74,75,76,77,78,84,86,87,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,114,115,116,118,130,131,132,134,135,136,137,142,144,147,150,153,156,159,160,161,162,163,166,167,170,173,177,180,181,184,185,186,188,190,193,194,198,205,206,207,210,212,213,214,215,216,217,218,219,220,227,232,234,237,239,241,242,243,245,246,248,249,250,251,252,253,257,258,262,265,266,267,268,269,270,271,272,273,276,277,279,280,281,282,283,284,285,286,288,289,290) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')\n",
      "/tmp/ipykernel_4155335/230593763.py:6: DtypeWarning: Columns (33,44,45,48,56,67,70,71,72,75,76,77,83,84,86,88,89,90,93,95,96,97,98,99,100,101,102,103,104,107,108,110,112,113,114,116,119,121,127,128,129,130,131,132,133,138,142,144,147,148,149,150,151,152,153,154,155,156,157,158,160,161,162,164,165,166,167,169,170,179,184,185,186,188,190,192,193,195,198,200,202,204,207,210,213,216,219,222,223,224,229,231,232,235,236,238,240,241,242,243,244,256,257,258,261,262,266,267,268,269,270,271,276,278,281,284,287,290,293,294,295,296,299,304,307,308,310,314,317,319,321,322) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')\n",
      "/tmp/ipykernel_4155335/230593763.py:6: DtypeWarning: Columns (33,44,45,50,55,58,63,64,65,66,67,68,69,70,71,72,74,75,76,80,81,82,84,85,87,88,89,90,91,92,97,100,101,102,105,106,107,108,110,111,112,114,115,116,117,118,120,123,124,126,127,130,132,133,134,135,136,137,138,139,140,141,142,143,144,146,147,150,151,152,153,154,155,156,157,160,161,162,163,165,166,168,169,171,174,175,176,179,180,181,182,187,189,192,195,196,199,202,205,206,207,214,215,216,217,218,220,221,222,234,235,236,240,241,242,254,255,256,257,260,261,262,263,264,265,266,268,269,270,271,272,274,275,277,280,282,284,286,289,292,295,298,300,303,306,308,309) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')\n"
     ]
    }
   ],
   "source": [
    "df = pd.DataFrame()\n",
    "for i in range(13,20):\n",
    "    if not os.path.isfile(f'./UNSW/UNSW-1/output{i}.csv'):\n",
    "        continue\n",
    "    write_log(f'<<<<<<<<----- Started Reading CSV File {i} ----->>>>>>>>')\n",
    "    temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')\n",
    "    df = pd.concat([df, temp_df], ignore_index=True)\n",
    "    write_log(f'------------ CSV File {i} added to DataFrame ------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "000ecde6-00fa-466f-99ff-b7d3fc0e9f9f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>DCE/RPC v5 ptype</th>\n",
       "      <th>DCE/RPC v5 pfc_flags</th>\n",
       "      <th>DCE/RPC v5 endian</th>\n",
       "      <th>DCE/RPC v5 encoding</th>\n",
       "      <th>DCE/RPC v5 float</th>\n",
       "      <th>DCE/RPC v5 frag_len</th>\n",
       "      <th>DCE/RPC v5 auth_len</th>\n",
       "      <th>DCE/RPC v5 call_id</th>\n",
       "      <th>DCE/RPC v5 - Bind reserved</th>\n",
       "      <th>DCE/RPC v5 - Bind if_uuid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>149.171.126.2</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>65090.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000049a5ae40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.421928e+09</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>47518.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc590f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.421940e+09</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>5190.0</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>40335.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a57763000008004500004ca61f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.4</td>\n",
       "      <td>55114.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dcb9a540...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.421943e+09</td>\n",
       "      <td>149.171.126.8</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>34569.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>521.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000209344240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287764</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.2</td>\n",
       "      <td>20503.0</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>33532.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>313.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000139a19e00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287765</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253.0</td>\n",
       "      <td>148.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000094043500...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287766</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>252.0</td>\n",
       "      <td>148.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000094043500...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287767</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003603ad00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287768</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003603ad00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>287769 rows × 66 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               stime           srcip    sport           dstip   dsport  \\\n",
       "0       1.421933e+09   149.171.126.2    143.0      59.166.0.6  65090.0   \n",
       "1       1.421928e+09   149.171.126.6     80.0      59.166.0.7  47518.0   \n",
       "2       1.421940e+09   149.171.126.4   5190.0      59.166.0.0  40335.0   \n",
       "3       1.421933e+09   149.171.126.0   6881.0      59.166.0.4  55114.0   \n",
       "4       1.421943e+09   149.171.126.8   6881.0      59.166.0.1  34569.0   \n",
       "...              ...             ...      ...             ...      ...   \n",
       "287764  1.424262e+09    175.45.176.2  20503.0  149.171.126.10  33532.0   \n",
       "287765  1.424262e+09  149.171.126.12   1014.0    175.45.176.0  60251.0   \n",
       "287766  1.424262e+09  149.171.126.12   1014.0    175.45.176.0  60251.0   \n",
       "287767  1.424262e+09    175.45.176.0  60251.0  149.171.126.12   1014.0   \n",
       "287768  1.424262e+09    175.45.176.0  60251.0  149.171.126.12   1014.0   \n",
       "\n",
       "       protocol_m   sttl  total_len   first_layer  \\\n",
       "0             tcp   30.0       73.0  cooked linux   \n",
       "1             tcp   30.0     1500.0  cooked linux   \n",
       "2             tcp   29.0       76.0  cooked linux   \n",
       "3             tcp   29.0     1500.0  cooked linux   \n",
       "4             tcp   29.0      521.0  cooked linux   \n",
       "...           ...    ...        ...           ...   \n",
       "287764        tcp  254.0      313.0  cooked linux   \n",
       "287765        tcp  253.0      148.0  cooked linux   \n",
       "287766        tcp  252.0      148.0  cooked linux   \n",
       "287767        tcp  255.0       54.0  cooked linux   \n",
       "287768        tcp  254.0       54.0  cooked linux   \n",
       "\n",
       "                                                   packet  ...  \\\n",
       "0       000000010006001b17059e1c0000080045000049a5ae40...  ...   \n",
       "1       000000010006001b17059e1c00000800450005dc590f40...  ...   \n",
       "2       000400010006005056a57763000008004500004ca61f40...  ...   \n",
       "3       000400010006005056a5776300000800450005dcb9a540...  ...   \n",
       "4       000400010006005056a577630000080045000209344240...  ...   \n",
       "...                                                   ...  ...   \n",
       "287764  000400010006005056a524c20000080045000139a19e00...  ...   \n",
       "287765  000000010006001b17059e1c0000080045000094043500...  ...   \n",
       "287766  000400010006005056a577630000080045000094043500...  ...   \n",
       "287767  000000010006021ac5000000000008004500003603ad00...  ...   \n",
       "287768  000400010006005056a524c2000008004500003603ad00...  ...   \n",
       "\n",
       "       DCE/RPC v5 ptype  DCE/RPC v5 pfc_flags  DCE/RPC v5 endian  \\\n",
       "0                   NaN                   NaN                NaN   \n",
       "1                   NaN                   NaN                NaN   \n",
       "2                   NaN                   NaN                NaN   \n",
       "3                   NaN                   NaN                NaN   \n",
       "4                   NaN                   NaN                NaN   \n",
       "...                 ...                   ...                ...   \n",
       "287764              NaN                   NaN                NaN   \n",
       "287765              NaN                   NaN                NaN   \n",
       "287766              NaN                   NaN                NaN   \n",
       "287767              NaN                   NaN                NaN   \n",
       "287768              NaN                   NaN                NaN   \n",
       "\n",
       "       DCE/RPC v5 encoding  DCE/RPC v5 float  DCE/RPC v5 frag_len  \\\n",
       "0                      NaN               NaN                  NaN   \n",
       "1                      NaN               NaN                  NaN   \n",
       "2                      NaN               NaN                  NaN   \n",
       "3                      NaN               NaN                  NaN   \n",
       "4                      NaN               NaN                  NaN   \n",
       "...                    ...               ...                  ...   \n",
       "287764                 NaN               NaN                  NaN   \n",
       "287765                 NaN               NaN                  NaN   \n",
       "287766                 NaN               NaN                  NaN   \n",
       "287767                 NaN               NaN                  NaN   \n",
       "287768                 NaN               NaN                  NaN   \n",
       "\n",
       "       DCE/RPC v5 auth_len  DCE/RPC v5 call_id DCE/RPC v5 - Bind reserved  \\\n",
       "0                      NaN                 NaN                        NaN   \n",
       "1                      NaN                 NaN                        NaN   \n",
       "2                      NaN                 NaN                        NaN   \n",
       "3                      NaN                 NaN                        NaN   \n",
       "4                      NaN                 NaN                        NaN   \n",
       "...                    ...                 ...                        ...   \n",
       "287764                 NaN                 NaN                        NaN   \n",
       "287765                 NaN                 NaN                        NaN   \n",
       "287766                 NaN                 NaN                        NaN   \n",
       "287767                 NaN                 NaN                        NaN   \n",
       "287768                 NaN                 NaN                        NaN   \n",
       "\n",
       "       DCE/RPC v5 - Bind if_uuid  \n",
       "0                            NaN  \n",
       "1                            NaN  \n",
       "2                            NaN  \n",
       "3                            NaN  \n",
       "4                            NaN  \n",
       "...                          ...  \n",
       "287764                       NaN  \n",
       "287765                       NaN  \n",
       "287766                       NaN  \n",
       "287767                       NaN  \n",
       "287768                       NaN  \n",
       "\n",
       "[287769 rows x 66 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "eebb7ab7-b058-4b96-b369-d8b61112c21a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            32636134\n",
       "exploits            287057\n",
       "dos                  88175\n",
       "fuzzers              47422\n",
       "generic              39075\n",
       "reconnaissance        8264\n",
       "worms                 1733\n",
       "shellcode              918\n",
       "analysis               483\n",
       "backdoor               449\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['attack_cat'].value_counts() # Part 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a14d5569-6323-4e22-b785-62ac1d0028e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            32208840\n",
       "exploits            552233\n",
       "dos                 164759\n",
       "generic              75602\n",
       "fuzzers              61236\n",
       "reconnaissance       25306\n",
       "worms                 3026\n",
       "shellcode             1785\n",
       "analysis              1100\n",
       "backdoor               890\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['attack_cat'].value_counts() # Part 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "847441f2-3ac7-40fb-8da2-4b640e749793",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            31653081\n",
       "exploits           1104825\n",
       "dos                 317321\n",
       "generic             162265\n",
       "fuzzers             144444\n",
       "reconnaissance       32193\n",
       "worms                 6619\n",
       "shellcode             3549\n",
       "analysis              2288\n",
       "backdoor              2028\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['attack_cat'].value_counts() # Part 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9f9e188e-1c42-4b7c-8d68-c8ff2c513d37",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(f'./UNSW/UNSW-2/output3.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4542ffeb-8b4f-4be8-b4ca-bbbb47325450",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query an</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query ns</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query rrname</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query type</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query rclass</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query ttl</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query rdlen</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query rdata</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query load</th>\n",
       "      <th>Link Local Multicast Node Resolution - Query ar</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>2142.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052f17c40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49465261</th>\n",
       "      <td>1.421972e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>21.0</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>46392.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a57763000008004500003a76d440...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49465262</th>\n",
       "      <td>1.421972e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>21.0</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>46392.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c000008004500003a76d440...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49465263</th>\n",
       "      <td>1.421972e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>47614.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000064f7e140...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49465264</th>\n",
       "      <td>1.421972e+09</td>\n",
       "      <td>149.171.126.8</td>\n",
       "      <td>22.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>59131.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000064e83a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49465265</th>\n",
       "      <td>1.421972e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>47614.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000064f7e140...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>49465266 rows × 366 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 stime          srcip  sport       dstip   dsport protocol_m  \\\n",
       "0         1.421927e+09  149.171.126.0   53.0  59.166.0.3  49664.0        udp   \n",
       "1         1.421927e+09  149.171.126.5   53.0  59.166.0.5   3593.0        udp   \n",
       "2         1.421927e+09  149.171.126.5   53.0  59.166.0.5   3593.0        udp   \n",
       "3         1.421927e+09  149.171.126.0   53.0  59.166.0.3  49664.0        udp   \n",
       "4         1.421927e+09  149.171.126.4   53.0  59.166.0.6   2142.0        udp   \n",
       "...                ...            ...    ...         ...      ...        ...   \n",
       "49465261  1.421972e+09  149.171.126.1   21.0  59.166.0.5  46392.0        tcp   \n",
       "49465262  1.421972e+09  149.171.126.1   21.0  59.166.0.5  46392.0        tcp   \n",
       "49465263  1.421972e+09  149.171.126.0   22.0  59.166.0.5  47614.0        tcp   \n",
       "49465264  1.421972e+09  149.171.126.8   22.0  59.166.0.7  59131.0        tcp   \n",
       "49465265  1.421972e+09  149.171.126.0   22.0  59.166.0.5  47614.0        tcp   \n",
       "\n",
       "          sttl  total_len   first_layer  \\\n",
       "0         30.0       89.0  cooked linux   \n",
       "1         29.0       82.0  cooked linux   \n",
       "2         30.0       82.0  cooked linux   \n",
       "3         29.0       89.0  cooked linux   \n",
       "4         29.0       82.0  cooked linux   \n",
       "...        ...        ...           ...   \n",
       "49465261  29.0       58.0  cooked linux   \n",
       "49465262  30.0       58.0  cooked linux   \n",
       "49465263  29.0      100.0  cooked linux   \n",
       "49465264  30.0      100.0  cooked linux   \n",
       "49465265  30.0      100.0  cooked linux   \n",
       "\n",
       "                                                     packet  ...  \\\n",
       "0         000000010006001b17059e1c0000080045000059f17a40...  ...   \n",
       "1         000400010006005056a577630000080045000052456e40...  ...   \n",
       "2         000000010006001b17059e1c0000080045000052456e40...  ...   \n",
       "3         000400010006005056a577630000080045000059f17a40...  ...   \n",
       "4         000400010006005056a577630000080045000052f17c40...  ...   \n",
       "...                                                     ...  ...   \n",
       "49465261  000400010006005056a57763000008004500003a76d440...  ...   \n",
       "49465262  000000010006001b17059e1c000008004500003a76d440...  ...   \n",
       "49465263  000400010006005056a577630000080045000064f7e140...  ...   \n",
       "49465264  000000010006001b17059e1c0000080045000064e83a40...  ...   \n",
       "49465265  000000010006001b17059e1c0000080045000064f7e140...  ...   \n",
       "\n",
       "         Link Local Multicast Node Resolution - Query an  \\\n",
       "0                                                    NaN   \n",
       "1                                                    NaN   \n",
       "2                                                    NaN   \n",
       "3                                                    NaN   \n",
       "4                                                    NaN   \n",
       "...                                                  ...   \n",
       "49465261                                             NaN   \n",
       "49465262                                             NaN   \n",
       "49465263                                             NaN   \n",
       "49465264                                             NaN   \n",
       "49465265                                             NaN   \n",
       "\n",
       "          Link Local Multicast Node Resolution - Query ns  \\\n",
       "0                                                     NaN   \n",
       "1                                                     NaN   \n",
       "2                                                     NaN   \n",
       "3                                                     NaN   \n",
       "4                                                     NaN   \n",
       "...                                                   ...   \n",
       "49465261                                              NaN   \n",
       "49465262                                              NaN   \n",
       "49465263                                              NaN   \n",
       "49465264                                              NaN   \n",
       "49465265                                              NaN   \n",
       "\n",
       "          Link Local Multicast Node Resolution - Query rrname  \\\n",
       "0                                                       NaN     \n",
       "1                                                       NaN     \n",
       "2                                                       NaN     \n",
       "3                                                       NaN     \n",
       "4                                                       NaN     \n",
       "...                                                     ...     \n",
       "49465261                                                NaN     \n",
       "49465262                                                NaN     \n",
       "49465263                                                NaN     \n",
       "49465264                                                NaN     \n",
       "49465265                                                NaN     \n",
       "\n",
       "         Link Local Multicast Node Resolution - Query type  \\\n",
       "0                                                      NaN   \n",
       "1                                                      NaN   \n",
       "2                                                      NaN   \n",
       "3                                                      NaN   \n",
       "4                                                      NaN   \n",
       "...                                                    ...   \n",
       "49465261                                               NaN   \n",
       "49465262                                               NaN   \n",
       "49465263                                               NaN   \n",
       "49465264                                               NaN   \n",
       "49465265                                               NaN   \n",
       "\n",
       "          Link Local Multicast Node Resolution - Query rclass  \\\n",
       "0                                                       NaN     \n",
       "1                                                       NaN     \n",
       "2                                                       NaN     \n",
       "3                                                       NaN     \n",
       "4                                                       NaN     \n",
       "...                                                     ...     \n",
       "49465261                                                NaN     \n",
       "49465262                                                NaN     \n",
       "49465263                                                NaN     \n",
       "49465264                                                NaN     \n",
       "49465265                                                NaN     \n",
       "\n",
       "          Link Local Multicast Node Resolution - Query ttl  \\\n",
       "0                                                      NaN   \n",
       "1                                                      NaN   \n",
       "2                                                      NaN   \n",
       "3                                                      NaN   \n",
       "4                                                      NaN   \n",
       "...                                                    ...   \n",
       "49465261                                               NaN   \n",
       "49465262                                               NaN   \n",
       "49465263                                               NaN   \n",
       "49465264                                               NaN   \n",
       "49465265                                               NaN   \n",
       "\n",
       "         Link Local Multicast Node Resolution - Query rdlen  \\\n",
       "0                                                       NaN   \n",
       "1                                                       NaN   \n",
       "2                                                       NaN   \n",
       "3                                                       NaN   \n",
       "4                                                       NaN   \n",
       "...                                                     ...   \n",
       "49465261                                                NaN   \n",
       "49465262                                                NaN   \n",
       "49465263                                                NaN   \n",
       "49465264                                                NaN   \n",
       "49465265                                                NaN   \n",
       "\n",
       "          Link Local Multicast Node Resolution - Query rdata  \\\n",
       "0                                                       NaN    \n",
       "1                                                       NaN    \n",
       "2                                                       NaN    \n",
       "3                                                       NaN    \n",
       "4                                                       NaN    \n",
       "...                                                     ...    \n",
       "49465261                                                NaN    \n",
       "49465262                                                NaN    \n",
       "49465263                                                NaN    \n",
       "49465264                                                NaN    \n",
       "49465265                                                NaN    \n",
       "\n",
       "         Link Local Multicast Node Resolution - Query load  \\\n",
       "0                                                      NaN   \n",
       "1                                                      NaN   \n",
       "2                                                      NaN   \n",
       "3                                                      NaN   \n",
       "4                                                      NaN   \n",
       "...                                                    ...   \n",
       "49465261                                               NaN   \n",
       "49465262                                               NaN   \n",
       "49465263                                               NaN   \n",
       "49465264                                               NaN   \n",
       "49465265                                               NaN   \n",
       "\n",
       "         Link Local Multicast Node Resolution - Query ar  \n",
       "0                                                    NaN  \n",
       "1                                                    NaN  \n",
       "2                                                    NaN  \n",
       "3                                                    NaN  \n",
       "4                                                    NaN  \n",
       "...                                                  ...  \n",
       "49465261                                             NaN  \n",
       "49465262                                             NaN  \n",
       "49465263                                             NaN  \n",
       "49465264                                             NaN  \n",
       "49465265                                             NaN  \n",
       "\n",
       "[49465266 rows x 366 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1766f35-62d4-43c2-940b-f09aa568eb5a",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Sampling DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "608a75c7-7d80-4abd-997a-372254dd56f7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3335661/1943139555.py:1: DtypeWarning: Columns (33,34,39,44,47,52,53,54,55,56,57,59,60,61,62,64,74,75,76,77,79,81,82,83,84,85,86,87,89,90,91,93,94,95,97,98,99,100,105,107,110,113,116,119,122,123,124,125,126,128,131,132,134,137,139,140,146,150,153,156,157,160,162,163,164,166,169,170,172,173,175,176,178,180,182,183,184,185,188,190,191,192,193,194,195,196,197,198,202,205,207,209,210,212,214,216,218,220,223,225,226,227,230,231,232,233,234,235,240,243,244,245,246,247,251,252,253,254,255,256,259,260,261,262,263,264,265,267,269,270,271,272,273,274,276,277,286,291,292,293,295,297,299,302,304,305,306,307,308,309,310,311,312,313,314,315,316,317,329,330,331,332,337,338,339) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv('./UNSW/UNSW-2/output1.csv')\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('./UNSW/UNSW-2/output1.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a59358bf-ff25-48a9-bb2a-81e5d8c71b76",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>802.15.4 Beacon src_panid</th>\n",
       "      <th>802.15.4 Beacon src_addr</th>\n",
       "      <th>L2TP hdr</th>\n",
       "      <th>L2TP offset</th>\n",
       "      <th>L2TP len</th>\n",
       "      <th>L2TP ns</th>\n",
       "      <th>L2TP nr</th>\n",
       "      <th>MGCP verb</th>\n",
       "      <th>MGCP endpoint</th>\n",
       "      <th>MGCP version</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.5</td>\n",
       "      <td>3593.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000052456e40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>49664.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000059f17a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.421927e+09</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>53.0</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>2142.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000052f17c40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33109705</th>\n",
       "      <td>1.421957e+09</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>35057.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc5cfe40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33109706</th>\n",
       "      <td>1.421957e+09</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>21827.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dcf55240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33109707</th>\n",
       "      <td>1.421957e+09</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>21827.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dcf55240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33109708</th>\n",
       "      <td>1.421957e+09</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>21827.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dcf55040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33109709</th>\n",
       "      <td>1.421957e+09</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>21827.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dcf55040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>33109710 rows × 340 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 stime          srcip   sport       dstip   dsport protocol_m  \\\n",
       "0         1.421927e+09  149.171.126.0    53.0  59.166.0.3  49664.0        udp   \n",
       "1         1.421927e+09  149.171.126.5    53.0  59.166.0.5   3593.0        udp   \n",
       "2         1.421927e+09  149.171.126.5    53.0  59.166.0.5   3593.0        udp   \n",
       "3         1.421927e+09  149.171.126.0    53.0  59.166.0.3  49664.0        udp   \n",
       "4         1.421927e+09  149.171.126.4    53.0  59.166.0.6   2142.0        udp   \n",
       "...                ...            ...     ...         ...      ...        ...   \n",
       "33109705  1.421957e+09  149.171.126.6    80.0  59.166.0.9  35057.0        tcp   \n",
       "33109706  1.421957e+09  149.171.126.3  6881.0  59.166.0.7  21827.0        tcp   \n",
       "33109707  1.421957e+09  149.171.126.3  6881.0  59.166.0.7  21827.0        tcp   \n",
       "33109708  1.421957e+09  149.171.126.3  6881.0  59.166.0.7  21827.0        tcp   \n",
       "33109709  1.421957e+09  149.171.126.3  6881.0  59.166.0.7  21827.0        tcp   \n",
       "\n",
       "          sttl  total_len   first_layer  \\\n",
       "0         30.0       89.0  cooked linux   \n",
       "1         29.0       82.0  cooked linux   \n",
       "2         30.0       82.0  cooked linux   \n",
       "3         29.0       89.0  cooked linux   \n",
       "4         29.0       82.0  cooked linux   \n",
       "...        ...        ...           ...   \n",
       "33109705  30.0     1500.0  cooked linux   \n",
       "33109706  29.0     1500.0  cooked linux   \n",
       "33109707  30.0     1500.0  cooked linux   \n",
       "33109708  29.0     1500.0  cooked linux   \n",
       "33109709  30.0     1500.0  cooked linux   \n",
       "\n",
       "                                                     packet  ...  \\\n",
       "0         000000010006001b17059e1c0000080045000059f17a40...  ...   \n",
       "1         000400010006005056a577630000080045000052456e40...  ...   \n",
       "2         000000010006001b17059e1c0000080045000052456e40...  ...   \n",
       "3         000400010006005056a577630000080045000059f17a40...  ...   \n",
       "4         000400010006005056a577630000080045000052f17c40...  ...   \n",
       "...                                                     ...  ...   \n",
       "33109705  000000010006001b17059e1c00000800450005dc5cfe40...  ...   \n",
       "33109706  000400010006005056a5776300000800450005dcf55240...  ...   \n",
       "33109707  000000010006001b17059e1c00000800450005dcf55240...  ...   \n",
       "33109708  000400010006005056a5776300000800450005dcf55040...  ...   \n",
       "33109709  000000010006001b17059e1c00000800450005dcf55040...  ...   \n",
       "\n",
       "         802.15.4 Beacon src_panid  802.15.4 Beacon src_addr  L2TP hdr  \\\n",
       "0                              NaN                       NaN       NaN   \n",
       "1                              NaN                       NaN       NaN   \n",
       "2                              NaN                       NaN       NaN   \n",
       "3                              NaN                       NaN       NaN   \n",
       "4                              NaN                       NaN       NaN   \n",
       "...                            ...                       ...       ...   \n",
       "33109705                       NaN                       NaN       NaN   \n",
       "33109706                       NaN                       NaN       NaN   \n",
       "33109707                       NaN                       NaN       NaN   \n",
       "33109708                       NaN                       NaN       NaN   \n",
       "33109709                       NaN                       NaN       NaN   \n",
       "\n",
       "         L2TP offset  L2TP len  L2TP ns L2TP nr  MGCP verb MGCP endpoint  \\\n",
       "0                NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "1                NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "2                NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "3                NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "4                NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "...              ...       ...      ...     ...        ...           ...   \n",
       "33109705         NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "33109706         NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "33109707         NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "33109708         NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "33109709         NaN       NaN      NaN     NaN        NaN           NaN   \n",
       "\n",
       "         MGCP version  \n",
       "0                 NaN  \n",
       "1                 NaN  \n",
       "2                 NaN  \n",
       "3                 NaN  \n",
       "4                 NaN  \n",
       "...               ...  \n",
       "33109705          NaN  \n",
       "33109706          NaN  \n",
       "33109707          NaN  \n",
       "33109708          NaN  \n",
       "33109709          NaN  \n",
       "\n",
       "[33109710 rows x 340 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f5513e08-16e3-447c-aac2-9c8e6e002808",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df['protocol_m']=='tcp']\n",
    "columns_to_remove = df.columns[df.notna().sum() < 1000]\n",
    "rows_to_remove = df[df[columns_to_remove].notna().any(axis=1)].index\n",
    "df = df.drop(columns_to_remove, axis=1)\n",
    "df = df.drop(rows_to_remove)\n",
    "df = df.dropna(subset='Raw load')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5aa420d2-272d-4e41-abcb-beacf4f8dbce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            31677346\n",
       "exploits            253237\n",
       "dos                  78131\n",
       "generic              30859\n",
       "fuzzers              25975\n",
       "reconnaissance        2039\n",
       "worms                 1530\n",
       "backdoor               252\n",
       "analysis               234\n",
       "shellcode              220\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['attack_cat'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a1144c86-e3b9-4f6c-acfa-2fe1365bcce4",
   "metadata": {},
   "outputs": [],
   "source": [
    "dic = {\n",
    "    'normal':18000,\n",
    "'exploits':10000,\n",
    "'dos':10000,\n",
    "'generic':5000,\n",
    "'fuzzers':5000,\n",
    "'reconnaissance':50000,\n",
    "'worms':50000,\n",
    "'shellcode':50000,\n",
    "'analysis':50000,\n",
    "    'backdoor':50000\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b4dc1f56-2196-4083-a3a9-91520274f8c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "val = df['attack_cat'].value_counts().to_dict()\n",
    "fdf = pd.DataFrame()\n",
    "for key in dic.keys():\n",
    "    if dic[key] < val[key]:\n",
    "        sdf = df[df['attack_cat']==key].sample(dic[key], ignore_index=True)\n",
    "    else:\n",
    "        sdf = df[df['attack_cat']==key]\n",
    "    fdf = pd.concat([fdf, sdf], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "802a6875-0e84-4ebe-bc78-fcc45b57a0ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "sdf = df[df['attack_cat']=='analysis']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "75818b3d-c0d4-4d04-8222-f3c4a40de8d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "sdf = df[df['attack_cat']=='normal'].sample(1500, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d9706668-0c22-41cb-b6fa-fd3f2807bc7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "fdf = pd.concat([fdf, sdf], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d81dd94c-5a4a-4944-bb89-40056e3466d3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>TCP urgptr</th>\n",
       "      <th>TCP options</th>\n",
       "      <th>Padding load</th>\n",
       "      <th>SMB Negotiate Extended Security Response (SMB) load</th>\n",
       "      <th>Skinny len</th>\n",
       "      <th>Skinny res</th>\n",
       "      <th>Skinny msg</th>\n",
       "      <th>PPTP len</th>\n",
       "      <th>PPTP type</th>\n",
       "      <th>PPTP data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>149.171.126.2</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>65090.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000049a5ae40...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.421928e+09</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>47518.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc590f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.421940e+09</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>5190.0</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>40335.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a57763000008004500004ca61f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.4</td>\n",
       "      <td>55114.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dcb9a540...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.421943e+09</td>\n",
       "      <td>149.171.126.8</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>34569.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>521.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000209344240...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52270</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>2137.0</td>\n",
       "      <td>149.171.126.15</td>\n",
       "      <td>80.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255.0</td>\n",
       "      <td>241.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac500000000000800450000f167d300...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52271</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37794.0</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000034b99d00...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52272</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37794.0</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000034b99d00...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52273</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37794.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>252.0</td>\n",
       "      <td>1481.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005c91ae800...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52274</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37794.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253.0</td>\n",
       "      <td>1481.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005c91ae800...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>52275 rows × 52 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime           srcip    sport           dstip   dsport  \\\n",
       "0      1.421933e+09   149.171.126.2    143.0      59.166.0.6  65090.0   \n",
       "1      1.421928e+09   149.171.126.6     80.0      59.166.0.7  47518.0   \n",
       "2      1.421940e+09   149.171.126.4   5190.0      59.166.0.0  40335.0   \n",
       "3      1.421933e+09   149.171.126.0   6881.0      59.166.0.4  55114.0   \n",
       "4      1.421943e+09   149.171.126.8   6881.0      59.166.0.1  34569.0   \n",
       "...             ...             ...      ...             ...      ...   \n",
       "52270  1.421934e+09    175.45.176.0   2137.0  149.171.126.15     80.0   \n",
       "52271  1.421934e+09    175.45.176.1  37794.0  149.171.126.10   9999.0   \n",
       "52272  1.421934e+09    175.45.176.1  37794.0  149.171.126.10   9999.0   \n",
       "52273  1.421934e+09  149.171.126.10   9999.0    175.45.176.1  37794.0   \n",
       "52274  1.421934e+09  149.171.126.10   9999.0    175.45.176.1  37794.0   \n",
       "\n",
       "      protocol_m   sttl  total_len   first_layer  \\\n",
       "0            tcp   30.0       73.0  cooked linux   \n",
       "1            tcp   30.0     1500.0  cooked linux   \n",
       "2            tcp   29.0       76.0  cooked linux   \n",
       "3            tcp   29.0     1500.0  cooked linux   \n",
       "4            tcp   29.0      521.0  cooked linux   \n",
       "...          ...    ...        ...           ...   \n",
       "52270        tcp  255.0      241.0  cooked linux   \n",
       "52271        tcp  254.0       52.0  cooked linux   \n",
       "52272        tcp  255.0       52.0  cooked linux   \n",
       "52273        tcp  252.0     1481.0  cooked linux   \n",
       "52274        tcp  253.0     1481.0  cooked linux   \n",
       "\n",
       "                                                  packet  ... TCP urgptr  \\\n",
       "0      000000010006001b17059e1c0000080045000049a5ae40...  ...        0.0   \n",
       "1      000000010006001b17059e1c00000800450005dc590f40...  ...        0.0   \n",
       "2      000400010006005056a57763000008004500004ca61f40...  ...        0.0   \n",
       "3      000400010006005056a5776300000800450005dcb9a540...  ...        0.0   \n",
       "4      000400010006005056a577630000080045000209344240...  ...        0.0   \n",
       "...                                                  ...  ...        ...   \n",
       "52270  000000010006021ac500000000000800450000f167d300...  ...        0.0   \n",
       "52271  000400010006005056a524c20000080045000034b99d00...  ...        0.0   \n",
       "52272  000000010006021ac50000000000080045000034b99d00...  ...        0.0   \n",
       "52273  000400010006005056a5776300000800450005c91ae800...  ...        0.0   \n",
       "52274  000000010006001b17059e1c00000800450005c91ae800...  ...        0.0   \n",
       "\n",
       "                                             TCP options  Padding load  \\\n",
       "0      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "1      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "2      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "3      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "4      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "...                                                  ...           ...   \n",
       "52270                                                 []           NaN   \n",
       "52271                                                 []           NaN   \n",
       "52272                                                 []           NaN   \n",
       "52273                                                 []           NaN   \n",
       "52274                                                 []           NaN   \n",
       "\n",
       "      SMB Negotiate Extended Security Response (SMB) load  Skinny len  \\\n",
       "0                                                    NaN          NaN   \n",
       "1                                                    NaN          NaN   \n",
       "2                                                    NaN          NaN   \n",
       "3                                                    NaN          NaN   \n",
       "4                                                    NaN          NaN   \n",
       "...                                                  ...          ...   \n",
       "52270                                                NaN          NaN   \n",
       "52271                                                NaN          NaN   \n",
       "52272                                                NaN          NaN   \n",
       "52273                                                NaN          NaN   \n",
       "52274                                                NaN          NaN   \n",
       "\n",
       "       Skinny res Skinny msg  PPTP len PPTP type PPTP data  \n",
       "0             NaN        NaN       NaN       NaN       NaN  \n",
       "1             NaN        NaN       NaN       NaN       NaN  \n",
       "2             NaN        NaN       NaN       NaN       NaN  \n",
       "3             NaN        NaN       NaN       NaN       NaN  \n",
       "4             NaN        NaN       NaN       NaN       NaN  \n",
       "...           ...        ...       ...       ...       ...  \n",
       "52270         NaN        NaN       NaN       NaN       NaN  \n",
       "52271         NaN        NaN       NaN       NaN       NaN  \n",
       "52272         NaN        NaN       NaN       NaN       NaN  \n",
       "52273         NaN        NaN       NaN       NaN       NaN  \n",
       "52274         NaN        NaN       NaN       NaN       NaN  \n",
       "\n",
       "[52275 rows x 52 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "328f8fe2-d2de-4e55-9266-005f7a908cce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal            18000\n",
       "exploits          10000\n",
       "dos               10000\n",
       "generic            5000\n",
       "fuzzers            5000\n",
       "reconnaissance     2039\n",
       "worms              1530\n",
       "backdoor            252\n",
       "analysis            234\n",
       "shellcode           220\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fdf['attack_cat'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0adafaa2-0d3a-4b8a-ace8-e239696a9993",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directory first so this cell also works on a fresh checkout.\n",
    "os.makedirs('./UNSW/UNSW-3', exist_ok=True)\n",
    "# Plain string literal: the path has no placeholders, so no f-prefix is needed.\n",
    "fdf.to_csv('./UNSW/UNSW-3/output1.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81b6d05b-4c43-4994-8678-05942afd6268",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Combining CSV Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c29f805a-71c2-4156-99c3-718543e44b24",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3335661/2771487445.py:1: DtypeWarning: Columns (32,44) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df1 = pd.read_csv('./UNSW/UNSW-3/output1.csv')\n"
     ]
    }
   ],
   "source": [
    "df1 = pd.read_csv('./UNSW/UNSW-3/output1.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1e382273-2f1e-4d8c-8843-047983599eb7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>TCP urgptr</th>\n",
       "      <th>TCP options</th>\n",
       "      <th>Padding load</th>\n",
       "      <th>SMB Negotiate Extended Security Response (SMB) load</th>\n",
       "      <th>Skinny len</th>\n",
       "      <th>Skinny res</th>\n",
       "      <th>Skinny msg</th>\n",
       "      <th>PPTP len</th>\n",
       "      <th>PPTP type</th>\n",
       "      <th>PPTP data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>149.171.126.2</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>65090.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000049a5ae40...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.421928e+09</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>47518.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc590f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.421940e+09</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>5190.0</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>40335.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a57763000008004500004ca61f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.4</td>\n",
       "      <td>55114.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dcb9a540...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.421943e+09</td>\n",
       "      <td>149.171.126.8</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>34569.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>521.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000209344240...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52270</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>2137.0</td>\n",
       "      <td>149.171.126.15</td>\n",
       "      <td>80.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255.0</td>\n",
       "      <td>241.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac500000000000800450000f167d300...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52271</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37794.0</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000034b99d00...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52272</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37794.0</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000034b99d00...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52273</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37794.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>252.0</td>\n",
       "      <td>1481.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005c91ae800...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52274</th>\n",
       "      <td>1.421934e+09</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>175.45.176.1</td>\n",
       "      <td>37794.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253.0</td>\n",
       "      <td>1481.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005c91ae800...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>52275 rows × 52 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime           srcip    sport           dstip   dsport  \\\n",
       "0      1.421933e+09   149.171.126.2    143.0      59.166.0.6  65090.0   \n",
       "1      1.421928e+09   149.171.126.6     80.0      59.166.0.7  47518.0   \n",
       "2      1.421940e+09   149.171.126.4   5190.0      59.166.0.0  40335.0   \n",
       "3      1.421933e+09   149.171.126.0   6881.0      59.166.0.4  55114.0   \n",
       "4      1.421943e+09   149.171.126.8   6881.0      59.166.0.1  34569.0   \n",
       "...             ...             ...      ...             ...      ...   \n",
       "52270  1.421934e+09    175.45.176.0   2137.0  149.171.126.15     80.0   \n",
       "52271  1.421934e+09    175.45.176.1  37794.0  149.171.126.10   9999.0   \n",
       "52272  1.421934e+09    175.45.176.1  37794.0  149.171.126.10   9999.0   \n",
       "52273  1.421934e+09  149.171.126.10   9999.0    175.45.176.1  37794.0   \n",
       "52274  1.421934e+09  149.171.126.10   9999.0    175.45.176.1  37794.0   \n",
       "\n",
       "      protocol_m   sttl  total_len   first_layer  \\\n",
       "0            tcp   30.0       73.0  cooked linux   \n",
       "1            tcp   30.0     1500.0  cooked linux   \n",
       "2            tcp   29.0       76.0  cooked linux   \n",
       "3            tcp   29.0     1500.0  cooked linux   \n",
       "4            tcp   29.0      521.0  cooked linux   \n",
       "...          ...    ...        ...           ...   \n",
       "52270        tcp  255.0      241.0  cooked linux   \n",
       "52271        tcp  254.0       52.0  cooked linux   \n",
       "52272        tcp  255.0       52.0  cooked linux   \n",
       "52273        tcp  252.0     1481.0  cooked linux   \n",
       "52274        tcp  253.0     1481.0  cooked linux   \n",
       "\n",
       "                                                  packet  ... TCP urgptr  \\\n",
       "0      000000010006001b17059e1c0000080045000049a5ae40...  ...        0.0   \n",
       "1      000000010006001b17059e1c00000800450005dc590f40...  ...        0.0   \n",
       "2      000400010006005056a57763000008004500004ca61f40...  ...        0.0   \n",
       "3      000400010006005056a5776300000800450005dcb9a540...  ...        0.0   \n",
       "4      000400010006005056a577630000080045000209344240...  ...        0.0   \n",
       "...                                                  ...  ...        ...   \n",
       "52270  000000010006021ac500000000000800450000f167d300...  ...        0.0   \n",
       "52271  000400010006005056a524c20000080045000034b99d00...  ...        0.0   \n",
       "52272  000000010006021ac50000000000080045000034b99d00...  ...        0.0   \n",
       "52273  000400010006005056a5776300000800450005c91ae800...  ...        0.0   \n",
       "52274  000000010006001b17059e1c00000800450005c91ae800...  ...        0.0   \n",
       "\n",
       "                                             TCP options  Padding load  \\\n",
       "0      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "1      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "2      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "3      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "4      [('NOP', None), ('NOP', None), ('Timestamp', (...           NaN   \n",
       "...                                                  ...           ...   \n",
       "52270                                                 []           NaN   \n",
       "52271                                                 []           NaN   \n",
       "52272                                                 []           NaN   \n",
       "52273                                                 []           NaN   \n",
       "52274                                                 []           NaN   \n",
       "\n",
       "      SMB Negotiate Extended Security Response (SMB) load  Skinny len  \\\n",
       "0                                                    NaN          NaN   \n",
       "1                                                    NaN          NaN   \n",
       "2                                                    NaN          NaN   \n",
       "3                                                    NaN          NaN   \n",
       "4                                                    NaN          NaN   \n",
       "...                                                  ...          ...   \n",
       "52270                                                NaN          NaN   \n",
       "52271                                                NaN          NaN   \n",
       "52272                                                NaN          NaN   \n",
       "52273                                                NaN          NaN   \n",
       "52274                                                NaN          NaN   \n",
       "\n",
       "       Skinny res Skinny msg  PPTP len PPTP type PPTP data  \n",
       "0             NaN        NaN       NaN       NaN       NaN  \n",
       "1             NaN        NaN       NaN       NaN       NaN  \n",
       "2             NaN        NaN       NaN       NaN       NaN  \n",
       "3             NaN        NaN       NaN       NaN       NaN  \n",
       "4             NaN        NaN       NaN       NaN       NaN  \n",
       "...           ...        ...       ...       ...       ...  \n",
       "52270         NaN        NaN       NaN       NaN       NaN  \n",
       "52271         NaN        NaN       NaN       NaN       NaN  \n",
       "52272         NaN        NaN       NaN       NaN       NaN  \n",
       "52273         NaN        NaN       NaN       NaN       NaN  \n",
       "52274         NaN        NaN       NaN       NaN       NaN  \n",
       "\n",
       "[52275 rows x 52 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "364e90d7-869f-40de-ba53-36925e1cfda1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# low_memory=False makes pandas infer each column's dtype from the whole file;\n",
    "# the default chunked parsing raised a DtypeWarning (mixed types) for column 25.\n",
    "df2 = pd.read_csv('./UNSW/UNSW-3/output2.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "1fdc62f3-dee3-4281-9152-b41f4b04fe25",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>TCP window</th>\n",
       "      <th>TCP chksum</th>\n",
       "      <th>TCP urgptr</th>\n",
       "      <th>TCP options</th>\n",
       "      <th>Raw load</th>\n",
       "      <th>PPTP len</th>\n",
       "      <th>PPTP type</th>\n",
       "      <th>PPTP data</th>\n",
       "      <th>Padding load</th>\n",
       "      <th>SMB Negotiate Extended Security Response (SMB) load</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.421966e+09</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>37893.0</td>\n",
       "      <td>59.166.0.9</td>\n",
       "      <td>12183.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1352.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000548ac8a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>14480.0</td>\n",
       "      <td>0xa276</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>'\\x0e%\\\\xaa\\x12\\x08\\\\xc6\\'\\\\xe0\\\\xcb^\\\\x9e\\\\xd...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.424228e+09</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>34710.0</td>\n",
       "      <td>149.171.126.7</td>\n",
       "      <td>40734.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500004a381340...</td>\n",
       "      <td>...</td>\n",
       "      <td>8688.0</td>\n",
       "      <td>0xa3a6</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>'\\\\xe3\\x11\\x00\\x00\\x00O[\\x7f\\x1ả\\\\xb1\\\\\\\\x92\\...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.421968e+09</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>80.0</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>10602.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dcd96340...</td>\n",
       "      <td>...</td>\n",
       "      <td>7240.0</td>\n",
       "      <td>0xb790</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>'S\\\\xdaI]\\x18\\\\xc4\\\\xd6&gt;s\\\\xf5\\\\xc5\\\\xe25\\\\x8d...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.421972e+09</td>\n",
       "      <td>59.166.0.4</td>\n",
       "      <td>36158.0</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>143.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>31.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000044415a40...</td>\n",
       "      <td>...</td>\n",
       "      <td>18824.0</td>\n",
       "      <td>0x6fdf</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>'7 lsub \"\" \"~*\"\\r\\n'</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.421968e+09</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>80.0</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>39224.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc589640...</td>\n",
       "      <td>...</td>\n",
       "      <td>7240.0</td>\n",
       "      <td>0x5143</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>'ɑ\\x10\\\\xbd(#\\\\\\\\xe8\\\\xb5K\\\\x97\\\\x93\\\\xb1\\x05\\...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82673</th>\n",
       "      <td>1.424234e+09</td>\n",
       "      <td>149.171.126.11</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>65359.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253.0</td>\n",
       "      <td>1481.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005c9a0f400...</td>\n",
       "      <td>...</td>\n",
       "      <td>16383.0</td>\n",
       "      <td>0xc052</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'ForCed EnTrY 1.49.2\\r\\n\\r\\n\\r\\nConnection Sta...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82674</th>\n",
       "      <td>1.424234e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>65359.0</td>\n",
       "      <td>149.171.126.11</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003474d000...</td>\n",
       "      <td>...</td>\n",
       "      <td>16383.0</td>\n",
       "      <td>0x76c</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'GETOSVERSION'</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82675</th>\n",
       "      <td>1.424234e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>65359.0</td>\n",
       "      <td>149.171.126.11</td>\n",
       "      <td>9999.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003474d000...</td>\n",
       "      <td>...</td>\n",
       "      <td>16383.0</td>\n",
       "      <td>0x76c</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'GETOSVERSION'</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82676</th>\n",
       "      <td>1.424234e+09</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>33471.0</td>\n",
       "      <td>175.45.176.3</td>\n",
       "      <td>35817.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253.0</td>\n",
       "      <td>128.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000080f85600...</td>\n",
       "      <td>...</td>\n",
       "      <td>16383.0</td>\n",
       "      <td>0x4c8d</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'Microsoft Windows XP [Version 5.1.2600]\\r\\n(C...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82677</th>\n",
       "      <td>1.424234e+09</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>33471.0</td>\n",
       "      <td>175.45.176.3</td>\n",
       "      <td>35817.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>252.0</td>\n",
       "      <td>128.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000080f85600...</td>\n",
       "      <td>...</td>\n",
       "      <td>16383.0</td>\n",
       "      <td>0x4c8d</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'Microsoft Windows XP [Version 5.1.2600]\\r\\n(C...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>82678 rows × 49 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime           srcip    sport           dstip   dsport  \\\n",
       "0      1.421966e+09   149.171.126.6  37893.0      59.166.0.9  12183.0   \n",
       "1      1.424228e+09      59.166.0.0  34710.0   149.171.126.7  40734.0   \n",
       "2      1.421968e+09   149.171.126.4     80.0      59.166.0.1  10602.0   \n",
       "3      1.421972e+09      59.166.0.4  36158.0   149.171.126.0    143.0   \n",
       "4      1.421968e+09   149.171.126.5     80.0      59.166.0.2  39224.0   \n",
       "...             ...             ...      ...             ...      ...   \n",
       "82673  1.424234e+09  149.171.126.11   9999.0    175.45.176.0  65359.0   \n",
       "82674  1.424234e+09    175.45.176.0  65359.0  149.171.126.11   9999.0   \n",
       "82675  1.424234e+09    175.45.176.0  65359.0  149.171.126.11   9999.0   \n",
       "82676  1.424234e+09  149.171.126.12  33471.0    175.45.176.3  35817.0   \n",
       "82677  1.424234e+09  149.171.126.12  33471.0    175.45.176.3  35817.0   \n",
       "\n",
       "      protocol_m   sttl  total_len   first_layer  \\\n",
       "0            tcp   29.0     1352.0  cooked linux   \n",
       "1            tcp   31.0       74.0  cooked linux   \n",
       "2            tcp   30.0     1500.0  cooked linux   \n",
       "3            tcp   31.0       68.0  cooked linux   \n",
       "4            tcp   30.0     1500.0  cooked linux   \n",
       "...          ...    ...        ...           ...   \n",
       "82673        tcp  253.0     1481.0  cooked linux   \n",
       "82674        tcp  255.0       52.0  cooked linux   \n",
       "82675        tcp  254.0       52.0  cooked linux   \n",
       "82676        tcp  253.0      128.0  cooked linux   \n",
       "82677        tcp  252.0      128.0  cooked linux   \n",
       "\n",
       "                                                  packet  ... TCP window  \\\n",
       "0      000400010006005056a577630000080045000548ac8a40...  ...    14480.0   \n",
       "1      000400010006005056a524c2000008004500004a381340...  ...     8688.0   \n",
       "2      000000010006001b17059e1c00000800450005dcd96340...  ...     7240.0   \n",
       "3      000400010006005056a524c20000080045000044415a40...  ...    18824.0   \n",
       "4      000000010006001b17059e1c00000800450005dc589640...  ...     7240.0   \n",
       "...                                                  ...  ...        ...   \n",
       "82673  000000010006001b17059e1c00000800450005c9a0f400...  ...    16383.0   \n",
       "82674  000000010006021ac5000000000008004500003474d000...  ...    16383.0   \n",
       "82675  000400010006005056a524c2000008004500003474d000...  ...    16383.0   \n",
       "82676  000000010006001b17059e1c0000080045000080f85600...  ...    16383.0   \n",
       "82677  000400010006005056a577630000080045000080f85600...  ...    16383.0   \n",
       "\n",
       "       TCP chksum  TCP urgptr  \\\n",
       "0          0xa276         0.0   \n",
       "1          0xa3a6         0.0   \n",
       "2          0xb790         0.0   \n",
       "3          0x6fdf         0.0   \n",
       "4          0x5143         0.0   \n",
       "...           ...         ...   \n",
       "82673      0xc052         0.0   \n",
       "82674       0x76c         0.0   \n",
       "82675       0x76c         0.0   \n",
       "82676      0x4c8d         0.0   \n",
       "82677      0x4c8d         0.0   \n",
       "\n",
       "                                             TCP options  \\\n",
       "0      [('NOP', None), ('NOP', None), ('Timestamp', (...   \n",
       "1      [('NOP', None), ('NOP', None), ('Timestamp', (...   \n",
       "2      [('NOP', None), ('NOP', None), ('Timestamp', (...   \n",
       "3      [('NOP', None), ('NOP', None), ('Timestamp', (...   \n",
       "4      [('NOP', None), ('NOP', None), ('Timestamp', (...   \n",
       "...                                                  ...   \n",
       "82673                                                 []   \n",
       "82674                                                 []   \n",
       "82675                                                 []   \n",
       "82676                                                 []   \n",
       "82677                                                 []   \n",
       "\n",
       "                                                Raw load  PPTP len PPTP type  \\\n",
       "0      '\\x0e%\\\\xaa\\x12\\x08\\\\xc6\\'\\\\xe0\\\\xcb^\\\\x9e\\\\xd...       NaN       NaN   \n",
       "1      '\\\\xe3\\x11\\x00\\x00\\x00O[\\x7f\\x1ả\\\\xb1\\\\\\\\x92\\...       NaN       NaN   \n",
       "2      'S\\\\xdaI]\\x18\\\\xc4\\\\xd6>s\\\\xf5\\\\xc5\\\\xe25\\\\x8d...       NaN       NaN   \n",
       "3                                   '7 lsub \"\" \"~*\"\\r\\n'       NaN       NaN   \n",
       "4      'ɑ\\x10\\\\xbd(#\\\\\\\\xe8\\\\xb5K\\\\x97\\\\x93\\\\xb1\\x05\\...       NaN       NaN   \n",
       "...                                                  ...       ...       ...   \n",
       "82673  'ForCed EnTrY 1.49.2\\r\\n\\r\\n\\r\\nConnection Sta...       NaN       NaN   \n",
       "82674                                     'GETOSVERSION'       NaN       NaN   \n",
       "82675                                     'GETOSVERSION'       NaN       NaN   \n",
       "82676  'Microsoft Windows XP [Version 5.1.2600]\\r\\n(C...       NaN       NaN   \n",
       "82677  'Microsoft Windows XP [Version 5.1.2600]\\r\\n(C...       NaN       NaN   \n",
       "\n",
       "       PPTP data Padding load  \\\n",
       "0            NaN          NaN   \n",
       "1            NaN          NaN   \n",
       "2            NaN          NaN   \n",
       "3            NaN          NaN   \n",
       "4            NaN          NaN   \n",
       "...          ...          ...   \n",
       "82673        NaN          NaN   \n",
       "82674        NaN          NaN   \n",
       "82675        NaN          NaN   \n",
       "82676        NaN          NaN   \n",
       "82677        NaN          NaN   \n",
       "\n",
       "      SMB Negotiate Extended Security Response (SMB) load  \n",
       "0                                                    NaN   \n",
       "1                                                    NaN   \n",
       "2                                                    NaN   \n",
       "3                                                    NaN   \n",
       "4                                                    NaN   \n",
       "...                                                  ...   \n",
       "82673                                                NaN   \n",
       "82674                                                NaN   \n",
       "82675                                                NaN   \n",
       "82676                                                NaN   \n",
       "82677                                                NaN   \n",
       "\n",
       "[82678 rows x 49 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the frame read from output2.csv (rich repr: first/last rows, elided columns).\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8723dda0-b254-45eb-ba4a-8e82dd5bfbc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# low_memory=False makes pandas infer each column's dtype from the whole file;\n",
    "# the default chunked parsing raised a DtypeWarning (mixed types) for\n",
    "# columns 43-44, 50-51, 54-60 and 64-65.\n",
    "df3 = pd.read_csv('./UNSW/UNSW-3/output3.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c7472d17-c9f1-4c40-90d7-9dab686682d3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>DCE/RPC v5 ptype</th>\n",
       "      <th>DCE/RPC v5 pfc_flags</th>\n",
       "      <th>DCE/RPC v5 endian</th>\n",
       "      <th>DCE/RPC v5 encoding</th>\n",
       "      <th>DCE/RPC v5 float</th>\n",
       "      <th>DCE/RPC v5 frag_len</th>\n",
       "      <th>DCE/RPC v5 auth_len</th>\n",
       "      <th>DCE/RPC v5 call_id</th>\n",
       "      <th>DCE/RPC v5 - Bind reserved</th>\n",
       "      <th>DCE/RPC v5 - Bind if_uuid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.424242e+09</td>\n",
       "      <td>149.171.126.1</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>6394.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dcb06f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.424257e+09</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>53858.0</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>143.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>32.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac50000000000080045000047cca140...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.424250e+09</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>56762.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dcc63540...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.424254e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.2</td>\n",
       "      <td>52183.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dcb7a640...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.424248e+09</td>\n",
       "      <td>149.171.126.8</td>\n",
       "      <td>35358.0</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>7838.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450001953c2440...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152811</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.2</td>\n",
       "      <td>20503.0</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>33532.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>313.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000139a19e00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152812</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253.0</td>\n",
       "      <td>148.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000094043500...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152813</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>252.0</td>\n",
       "      <td>148.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000094043500...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152814</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003603ad00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152815</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003603ad00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>152816 rows × 66 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               stime           srcip    sport           dstip   dsport  \\\n",
       "0       1.424242e+09   149.171.126.1   6881.0      59.166.0.7   6394.0   \n",
       "1       1.424257e+09      59.166.0.0  53858.0   149.171.126.5    143.0   \n",
       "2       1.424250e+09   149.171.126.9   6881.0      59.166.0.0  56762.0   \n",
       "3       1.424254e+09   149.171.126.0   6881.0      59.166.0.2  52183.0   \n",
       "4       1.424248e+09   149.171.126.8  35358.0      59.166.0.1   7838.0   \n",
       "...              ...             ...      ...             ...      ...   \n",
       "152811  1.424262e+09    175.45.176.2  20503.0  149.171.126.10  33532.0   \n",
       "152812  1.424262e+09  149.171.126.12   1014.0    175.45.176.0  60251.0   \n",
       "152813  1.424262e+09  149.171.126.12   1014.0    175.45.176.0  60251.0   \n",
       "152814  1.424262e+09    175.45.176.0  60251.0  149.171.126.12   1014.0   \n",
       "152815  1.424262e+09    175.45.176.0  60251.0  149.171.126.12   1014.0   \n",
       "\n",
       "       protocol_m   sttl  total_len   first_layer  \\\n",
       "0             tcp   30.0     1500.0  cooked linux   \n",
       "1             tcp   32.0       71.0  cooked linux   \n",
       "2             tcp   29.0     1500.0  cooked linux   \n",
       "3             tcp   29.0     1500.0  cooked linux   \n",
       "4             tcp   29.0      405.0  cooked linux   \n",
       "...           ...    ...        ...           ...   \n",
       "152811        tcp  254.0      313.0  cooked linux   \n",
       "152812        tcp  253.0      148.0  cooked linux   \n",
       "152813        tcp  252.0      148.0  cooked linux   \n",
       "152814        tcp  255.0       54.0  cooked linux   \n",
       "152815        tcp  254.0       54.0  cooked linux   \n",
       "\n",
       "                                                   packet  ...  \\\n",
       "0       000000010006001b17059e1c00000800450005dcb06f40...  ...   \n",
       "1       000000010006021ac50000000000080045000047cca140...  ...   \n",
       "2       000400010006005056a5776300000800450005dcc63540...  ...   \n",
       "3       000400010006005056a5776300000800450005dcb7a640...  ...   \n",
       "4       000400010006005056a5776300000800450001953c2440...  ...   \n",
       "...                                                   ...  ...   \n",
       "152811  000400010006005056a524c20000080045000139a19e00...  ...   \n",
       "152812  000000010006001b17059e1c0000080045000094043500...  ...   \n",
       "152813  000400010006005056a577630000080045000094043500...  ...   \n",
       "152814  000000010006021ac5000000000008004500003603ad00...  ...   \n",
       "152815  000400010006005056a524c2000008004500003603ad00...  ...   \n",
       "\n",
       "       DCE/RPC v5 ptype  DCE/RPC v5 pfc_flags  DCE/RPC v5 endian  \\\n",
       "0                   NaN                   NaN                NaN   \n",
       "1                   NaN                   NaN                NaN   \n",
       "2                   NaN                   NaN                NaN   \n",
       "3                   NaN                   NaN                NaN   \n",
       "4                   NaN                   NaN                NaN   \n",
       "...                 ...                   ...                ...   \n",
       "152811              NaN                   NaN                NaN   \n",
       "152812              NaN                   NaN                NaN   \n",
       "152813              NaN                   NaN                NaN   \n",
       "152814              NaN                   NaN                NaN   \n",
       "152815              NaN                   NaN                NaN   \n",
       "\n",
       "       DCE/RPC v5 encoding  DCE/RPC v5 float  DCE/RPC v5 frag_len  \\\n",
       "0                      NaN               NaN                  NaN   \n",
       "1                      NaN               NaN                  NaN   \n",
       "2                      NaN               NaN                  NaN   \n",
       "3                      NaN               NaN                  NaN   \n",
       "4                      NaN               NaN                  NaN   \n",
       "...                    ...               ...                  ...   \n",
       "152811                 NaN               NaN                  NaN   \n",
       "152812                 NaN               NaN                  NaN   \n",
       "152813                 NaN               NaN                  NaN   \n",
       "152814                 NaN               NaN                  NaN   \n",
       "152815                 NaN               NaN                  NaN   \n",
       "\n",
       "       DCE/RPC v5 auth_len  DCE/RPC v5 call_id DCE/RPC v5 - Bind reserved  \\\n",
       "0                      NaN                 NaN                        NaN   \n",
       "1                      NaN                 NaN                        NaN   \n",
       "2                      NaN                 NaN                        NaN   \n",
       "3                      NaN                 NaN                        NaN   \n",
       "4                      NaN                 NaN                        NaN   \n",
       "...                    ...                 ...                        ...   \n",
       "152811                 NaN                 NaN                        NaN   \n",
       "152812                 NaN                 NaN                        NaN   \n",
       "152813                 NaN                 NaN                        NaN   \n",
       "152814                 NaN                 NaN                        NaN   \n",
       "152815                 NaN                 NaN                        NaN   \n",
       "\n",
       "       DCE/RPC v5 - Bind if_uuid  \n",
       "0                            NaN  \n",
       "1                            NaN  \n",
       "2                            NaN  \n",
       "3                            NaN  \n",
       "4                            NaN  \n",
       "...                          ...  \n",
       "152811                       NaN  \n",
       "152812                       NaN  \n",
       "152813                       NaN  \n",
       "152814                       NaN  \n",
       "152815                       NaN  \n",
       "\n",
       "[152816 rows x 66 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f301b982-9ac3-43c0-a16a-ece63bdfdd97",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([df1, df2, df3], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "633f6934-99ea-4157-95af-2958694ba441",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>DCE/RPC v5 ptype</th>\n",
       "      <th>DCE/RPC v5 pfc_flags</th>\n",
       "      <th>DCE/RPC v5 endian</th>\n",
       "      <th>DCE/RPC v5 encoding</th>\n",
       "      <th>DCE/RPC v5 float</th>\n",
       "      <th>DCE/RPC v5 frag_len</th>\n",
       "      <th>DCE/RPC v5 auth_len</th>\n",
       "      <th>DCE/RPC v5 call_id</th>\n",
       "      <th>DCE/RPC v5 - Bind reserved</th>\n",
       "      <th>DCE/RPC v5 - Bind if_uuid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>149.171.126.2</td>\n",
       "      <td>143.0</td>\n",
       "      <td>59.166.0.6</td>\n",
       "      <td>65090.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000049a5ae40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.421928e+09</td>\n",
       "      <td>149.171.126.6</td>\n",
       "      <td>80.0</td>\n",
       "      <td>59.166.0.7</td>\n",
       "      <td>47518.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c00000800450005dc590f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.421940e+09</td>\n",
       "      <td>149.171.126.4</td>\n",
       "      <td>5190.0</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>40335.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a57763000008004500004ca61f40...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.421933e+09</td>\n",
       "      <td>149.171.126.0</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.4</td>\n",
       "      <td>55114.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a5776300000800450005dcb9a540...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.421943e+09</td>\n",
       "      <td>149.171.126.8</td>\n",
       "      <td>6881.0</td>\n",
       "      <td>59.166.0.1</td>\n",
       "      <td>34569.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>29.0</td>\n",
       "      <td>521.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000209344240...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287764</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.2</td>\n",
       "      <td>20503.0</td>\n",
       "      <td>149.171.126.10</td>\n",
       "      <td>33532.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>313.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c20000080045000139a19e00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287765</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>253.0</td>\n",
       "      <td>148.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006001b17059e1c0000080045000094043500...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287766</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>252.0</td>\n",
       "      <td>148.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a577630000080045000094043500...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287767</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>255.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000000010006021ac5000000000008004500003603ad00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287768</th>\n",
       "      <td>1.424262e+09</td>\n",
       "      <td>175.45.176.0</td>\n",
       "      <td>60251.0</td>\n",
       "      <td>149.171.126.12</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>254.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>cooked linux</td>\n",
       "      <td>000400010006005056a524c2000008004500003603ad00...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>287769 rows × 66 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               stime           srcip    sport           dstip   dsport  \\\n",
       "0       1.421933e+09   149.171.126.2    143.0      59.166.0.6  65090.0   \n",
       "1       1.421928e+09   149.171.126.6     80.0      59.166.0.7  47518.0   \n",
       "2       1.421940e+09   149.171.126.4   5190.0      59.166.0.0  40335.0   \n",
       "3       1.421933e+09   149.171.126.0   6881.0      59.166.0.4  55114.0   \n",
       "4       1.421943e+09   149.171.126.8   6881.0      59.166.0.1  34569.0   \n",
       "...              ...             ...      ...             ...      ...   \n",
       "287764  1.424262e+09    175.45.176.2  20503.0  149.171.126.10  33532.0   \n",
       "287765  1.424262e+09  149.171.126.12   1014.0    175.45.176.0  60251.0   \n",
       "287766  1.424262e+09  149.171.126.12   1014.0    175.45.176.0  60251.0   \n",
       "287767  1.424262e+09    175.45.176.0  60251.0  149.171.126.12   1014.0   \n",
       "287768  1.424262e+09    175.45.176.0  60251.0  149.171.126.12   1014.0   \n",
       "\n",
       "       protocol_m   sttl  total_len   first_layer  \\\n",
       "0             tcp   30.0       73.0  cooked linux   \n",
       "1             tcp   30.0     1500.0  cooked linux   \n",
       "2             tcp   29.0       76.0  cooked linux   \n",
       "3             tcp   29.0     1500.0  cooked linux   \n",
       "4             tcp   29.0      521.0  cooked linux   \n",
       "...           ...    ...        ...           ...   \n",
       "287764        tcp  254.0      313.0  cooked linux   \n",
       "287765        tcp  253.0      148.0  cooked linux   \n",
       "287766        tcp  252.0      148.0  cooked linux   \n",
       "287767        tcp  255.0       54.0  cooked linux   \n",
       "287768        tcp  254.0       54.0  cooked linux   \n",
       "\n",
       "                                                   packet  ...  \\\n",
       "0       000000010006001b17059e1c0000080045000049a5ae40...  ...   \n",
       "1       000000010006001b17059e1c00000800450005dc590f40...  ...   \n",
       "2       000400010006005056a57763000008004500004ca61f40...  ...   \n",
       "3       000400010006005056a5776300000800450005dcb9a540...  ...   \n",
       "4       000400010006005056a577630000080045000209344240...  ...   \n",
       "...                                                   ...  ...   \n",
       "287764  000400010006005056a524c20000080045000139a19e00...  ...   \n",
       "287765  000000010006001b17059e1c0000080045000094043500...  ...   \n",
       "287766  000400010006005056a577630000080045000094043500...  ...   \n",
       "287767  000000010006021ac5000000000008004500003603ad00...  ...   \n",
       "287768  000400010006005056a524c2000008004500003603ad00...  ...   \n",
       "\n",
       "       DCE/RPC v5 ptype  DCE/RPC v5 pfc_flags  DCE/RPC v5 endian  \\\n",
       "0                   NaN                   NaN                NaN   \n",
       "1                   NaN                   NaN                NaN   \n",
       "2                   NaN                   NaN                NaN   \n",
       "3                   NaN                   NaN                NaN   \n",
       "4                   NaN                   NaN                NaN   \n",
       "...                 ...                   ...                ...   \n",
       "287764              NaN                   NaN                NaN   \n",
       "287765              NaN                   NaN                NaN   \n",
       "287766              NaN                   NaN                NaN   \n",
       "287767              NaN                   NaN                NaN   \n",
       "287768              NaN                   NaN                NaN   \n",
       "\n",
       "       DCE/RPC v5 encoding  DCE/RPC v5 float  DCE/RPC v5 frag_len  \\\n",
       "0                      NaN               NaN                  NaN   \n",
       "1                      NaN               NaN                  NaN   \n",
       "2                      NaN               NaN                  NaN   \n",
       "3                      NaN               NaN                  NaN   \n",
       "4                      NaN               NaN                  NaN   \n",
       "...                    ...               ...                  ...   \n",
       "287764                 NaN               NaN                  NaN   \n",
       "287765                 NaN               NaN                  NaN   \n",
       "287766                 NaN               NaN                  NaN   \n",
       "287767                 NaN               NaN                  NaN   \n",
       "287768                 NaN               NaN                  NaN   \n",
       "\n",
       "       DCE/RPC v5 auth_len  DCE/RPC v5 call_id DCE/RPC v5 - Bind reserved  \\\n",
       "0                      NaN                 NaN                        NaN   \n",
       "1                      NaN                 NaN                        NaN   \n",
       "2                      NaN                 NaN                        NaN   \n",
       "3                      NaN                 NaN                        NaN   \n",
       "4                      NaN                 NaN                        NaN   \n",
       "...                    ...                 ...                        ...   \n",
       "287764                 NaN                 NaN                        NaN   \n",
       "287765                 NaN                 NaN                        NaN   \n",
       "287766                 NaN                 NaN                        NaN   \n",
       "287767                 NaN                 NaN                        NaN   \n",
       "287768                 NaN                 NaN                        NaN   \n",
       "\n",
       "       DCE/RPC v5 - Bind if_uuid  \n",
       "0                            NaN  \n",
       "1                            NaN  \n",
       "2                            NaN  \n",
       "3                            NaN  \n",
       "4                            NaN  \n",
       "...                          ...  \n",
       "287764                       NaN  \n",
       "287765                       NaN  \n",
       "287766                       NaN  \n",
       "287767                       NaN  \n",
       "287768                       NaN  \n",
       "\n",
       "[287769 rows x 66 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "01fc40bc-8239-4c43-ab58-986567d9f9a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('./UNSW/UNSW-4/output.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b58b18f4-8208-4053-b4dc-d7ce84832b7a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (32,33,34,39,47,52,53,54,55,56,57,60,61,62,74,75,76,77,79,81,82,83,84,85,86,87,93,94,95,97,98,99,100,105,107,110,113,116,119,122,126,139,140,146,157,160,162,164,166,169,170,172,173,175,176,184,185,188,190,191,192,193,194,195,196,197,198,202,230,231,232,233,234,235,246,247,254,256,259,260,261,270,304,305,306,307,308,309,310) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('./UNSW/UNSW-4/output.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fb9f4de5-fc7f-49fa-aa45-67a9d032a1cc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normal            50000\n",
      "exploits          50000\n",
      "dos               50000\n",
      "generic           50000\n",
      "fuzzers           50000\n",
      "reconnaissance    22571\n",
      "worms              9970\n",
      "analysis           1896\n",
      "backdoor           1834\n",
      "shellcode          1498\n",
      "Name: attack_cat, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(df['attack_cat'].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a5061a6-c244-470e-a51d-6e7a61ece876",
   "metadata": {
    "tags": []
   },
   "source": [
    "# EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95ef7d79-c471-478c-b701-33eba8e96b45",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3711001/2456257565.py:7: DtypeWarning: Columns (33,34,39,44,47,52,53,54,55,56,57,59,60,61,62,63,64,74,75,78,79,80,81,82,83,84,85,86,88,90,91,92,93,94,95,96,98,99,100,103,105,106,107,108,109,110,111,112,113,115,116,117,118,123,125,128,131,134,137,140,141,142,143,144,150,157,158,164,168,171,174,175,176,177,179,181,182,187,188,191,193,194,195,197,200,201,203,204,205,206,208,210,211,212,213,215,217,218,222,224,225,230,236,237,238,239,240,243,245,246,247,248,249,250,251,252,253,257,258,261,263,265,266,281,282,283,286,287,288,289,290,291,296,299,300,301,302,303,308,309,310,311,312,315,316,317,318,319,320,321,323,324,326,328,329,330,331,338,339,342,343,344,345,347,350,352,353,361,362,363,364,365,366,368,369,378,383,384,385,386) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')\n",
      "/tmp/ipykernel_3711001/2456257565.py:7: DtypeWarning: Columns (32,33,44,45,46,48,51,59,64,65,66,67,68,69,70,71,72,75,76,82,83,84,87,88,89,90,91,92,95,96,98,100,101,103,104,105,106,107,108,109,112,115,116,117,118,120,122,123,124,125,130,131,133,135,136,137,138,140,146,147,148,149,150,151,153,154,155,158,159,160,162,163,165,166,168,169,170,171,173,175,176,177,178,180,181,182,183,184,185,186,188,190,191,192,195,196,198,199,201,204,206,208,210,213,216,219,222,224,227,230,232,233,234,235,238,239,240,243,245,246,247,248,249,250,251,252,253,254,255,256,257,260,261,264,265,266,267,268,269,270,275,277,280,281,282,284,288,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,309,321,322,323) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')\n",
      "/tmp/ipykernel_3711001/2456257565.py:7: DtypeWarning: Columns (75,76,79,80,81,82,83,84,89,92,93,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')\n",
      "/tmp/ipykernel_3711001/2456257565.py:7: DtypeWarning: Columns (45,66,69,70,71,74,75,76,79,80,81,82,83,84,85,97,98,99,114,119,120,121,124,131,134,135,136,137,138,139) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')\n",
      "/tmp/ipykernel_3711001/2456257565.py:7: DtypeWarning: Columns (75,76,79,80,81,82,83,84,85,87,88,92,96,99,100,101,102,109,110,113) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')\n"
     ]
    }
   ],
   "source": [
    "# Merge the per-file packet CSVs under ./UNSW into larger batch files for EDA.\n",
    "# Only indices whose file exists are processed. A batch is written after reading\n",
    "# a file whose index is a multiple of 6, and again after the last existing file,\n",
    "# so that files read after the final multiple of 6 are not left out of the export.\n",
    "file_ids = [i for i in range(0, 20) if os.path.isfile(f'./UNSW/output{i}.csv')]\n",
    "k = 1  # number used in the name of the next batch file\n",
    "frames = []  # frames read since the previous batch was written\n",
    "df = pd.DataFrame()\n",
    "for i in file_ids:\n",
    "    write_log(f'<<<<<<<<----- Started Reading CSV File {i} ----->>>>>>>>')\n",
    "    frames.append(pd.read_csv(f'./UNSW/output{i}.csv'))\n",
    "    write_log(f'------------ CSV File {i} added to DataFrame ------------')\n",
    "    if i%6 == 0 or i == file_ids[-1]:\n",
    "        # Concatenate once per batch instead of re-copying the growing frame per file.\n",
    "        df = pd.concat(frames, ignore_index=True)\n",
    "        df.to_csv(f'./UNSW/UNSW-EDA/output{k}.csv', index=False)\n",
    "        print(df['attack_cat'].value_counts())\n",
    "        # Release the written batch before starting the next one.\n",
    "        frames = []\n",
    "        df = pd.DataFrame()\n",
    "        k = k+1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "355c60db-a3bb-40cc-b0ab-303576056bed",
   "metadata": {},
   "source": [
    "# Exporting Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28500b50-ed02-4531-b41b-4d8fcddb73e9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3613372/3859504502.py:17: DtypeWarning: Columns (33,34,39,44,47,52,53,54,55,56,57,59,60,61,62,63,64,74,75,78,79,80,81,82,83,84,85,86,88,90,91,92,93,94,95,96,98,99,100,103,105,106,107,108,109,110,111,112,113,115,116,117,118,123,125,128,131,134,137,140,141,142,143,144,150,157,158,164,168,171,174,175,176,177,179,181,182,187,188,191,193,194,195,197,200,201,203,204,205,206,208,210,211,212,213,215,217,218,222,224,225,230,236,237,238,239,240,243,245,246,247,248,249,250,251,252,253,257,258,261,263,265,266,281,282,283,286,287,288,289,290,291,296,299,300,301,302,303,308,309,310,311,312,315,316,317,318,319,320,321,323,324,326,328,329,330,331,338,339,342,343,344,345,347,350,352,353,361,362,363,364,365,366,368,369,378,383,384,385,386) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(f'./UNSW/output{i}.csv')\n",
      "IOStream.flush timed out\n",
      "/tmp/ipykernel_3613372/3859504502.py:26: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  df.insert(0, 'flow_id', flow_id)\n",
      "/tmp/ipykernel_3613372/3859504502.py:28: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  df['packet_id'] = range(start_value, end_value)\n",
      "/tmp/ipykernel_3613372/3859504502.py:29: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  df['payload_length'] = df.payload_hex.apply(lambda x: len(x)//2 if isinstance(x, str) else 0)\n"
     ]
    }
   ],
   "source": [
    "def hex_to_dec(hex_str):\n",
    "    # Convert a hex string into a list of integers, reading two characters at a time.\n",
    "    # A trailing single character (odd-length input) is parsed on its own.\n",
    "    byte_values = []\n",
    "    for offset in range(0, len(hex_str), 2):\n",
    "        pair = hex_str[offset:offset + 2]\n",
    "        byte_values.append(int(pair, 16))\n",
    "    return byte_values\n",
    "\n",
    "# Value assigned to the first packet_id generated in the loop below.\n",
    "start_value = 1\n",
    "\n",
    "# Load the flow table, keeping only the id and the columns the packets are matched on.\n",
    "flow_columns = ['flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']\n",
    "flow1 = pd.read_csv('./UNSW/Export/UNSW_Flow.csv')[flow_columns]\n",
    "\n",
    "# Same flows with source and destination swapped, so either direction can be matched.\n",
    "swap_direction = {'source_ip': 'destination_ip', 'destination_ip': 'source_ip', 'source_port': 'destination_port', 'destination_port': 'source_port'}\n",
    "flow2 = flow1.rename(columns=swap_direction)\n",
    "\n",
    "# Stack both directions and keep the first row for each distinct combination of\n",
    "# the non-id columns.\n",
    "match_columns = flow1.columns.difference(['flow_id'])\n",
    "flow = pd.concat([flow1, flow2]).drop_duplicates(subset=match_columns)\n",
    "\n",
    "for i in range(20):\n",
    "    # Input chunks are optional: skip any index whose CSV is not on disk.\n",
    "    if not os.path.isfile(f'./UNSW/output{i}.csv'):\n",
    "        continue\n",
    "    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>\\n')\n",
    "    df = pd.read_csv(f'./UNSW/output{i}.csv')\n",
    "    \n",
    "    write_log(f'------------ BEGIN PACKET FLOW MERGE ------------')\n",
    "    # Align packet column names with the flow table so the two can be joined on the five-tuple.\n",
    "    df.rename(columns={'packet': 'packet_hex', 'payload': 'payload_hex', 'srcip': 'source_ip', 'dstip': 'destination_ip', 'sport': 'source_port', 'dsport': 'destination_port', 'protocol_m': 'protocol'}, inplace=True)\n",
    "    df['source_port'] = df['source_port'].astype(int)\n",
    "    df['destination_port'] = df['destination_port'].astype(int)\n",
    "    columns_to_match = ['source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']\n",
    "    # Left join keeps every packet; packets without a matching flow get a missing flow_id.\n",
    "    df = df.merge(flow, on=columns_to_match, how='left')\n",
    "    flow_id = df.pop('flow_id')\n",
    "    df.insert(0, 'flow_id', flow_id)\n",
    "    # Number the packets of this file consecutively, continuing from the previous file.\n",
    "    end_value = start_value + len(df)\n",
    "    df['packet_id'] = range(start_value, end_value)\n",
    "    # Two hex characters encode one byte; non-string payloads (e.g. NaN) count as length 0.\n",
    "    df['payload_length'] = df.payload_hex.apply(lambda x: len(x)//2 if isinstance(x, str) else 0)\n",
    "    write_log(f'------------ END PACKET FLOW MERGE ------------')\n",
    "    \n",
    "    write_log(f'------------ BEGIN PAYLOAD BYTES PROCESSING ------------')\n",
    "    # Only rows that have a 'Raw load' value are exported as payload bytes.\n",
    "    # (A list is passed to `subset` so this also works on pandas versions\n",
    "    # that do not accept a bare label.)\n",
    "    df1 = df.dropna(subset=['Raw load'])\n",
    "    df1.reset_index(drop=True, inplace=True)\n",
    "    dec_data = df1['payload_hex'].apply(hex_to_dec)\n",
    "    max_len = dec_data.apply(len).max()\n",
    "    # One column per byte position; rows shorter than the longest payload are padded with missing values.\n",
    "    df_final = pd.DataFrame(dec_data.tolist(), columns=[f'payload_byte_{i}' for i in range(1,max_len+1)])\n",
    "    df_final = pd.concat([df1[['packet_id', 'flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol', 'payload_length']], df_final], axis=1)\n",
    "    df_final['attack_label'] = df1['attack_cat']\n",
    "    # Rows that differ only in their ids are treated as duplicates.\n",
    "    df_final.drop_duplicates(subset=df_final.columns.difference(['packet_id', 'flow_id']), inplace=True)\n",
    "    write_log(f'------------ END PAYLOAD BYTES PROCESSING ------------')\n",
    "    \n",
    "    df_final.to_csv(f'./UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}.csv', index=False)\n",
    "    write_log(f'------------ CSV File {i} Saved for Payload Bytes ------------')\n",
    "    \n",
    "    # Release the wide intermediate frames before building the next export.\n",
    "    del dec_data\n",
    "    del df_final\n",
    "    del df1\n",
    "    \n",
    "    write_log(f'------------ BEGIN PACKET BYTES PROCESSING ------------')\n",
    "    # Same layout as the payload export, but built from the full packet hex of every row.\n",
    "    dec_data = df['packet_hex'].apply(hex_to_dec)\n",
    "    max_len = dec_data.apply(len).max()\n",
    "    df_final = pd.DataFrame(dec_data.tolist(), columns=[f'packet_byte_{i}' for i in range(1,max_len+1)])\n",
    "    df_final = pd.concat([df[['packet_id', 'flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol', 'payload_length']], df_final], axis=1)\n",
    "    df_final['attack_label'] = df['attack_cat']\n",
    "    df_final.drop_duplicates(subset=df_final.columns.difference(['packet_id', 'flow_id']), inplace=True)\n",
    "    write_log(f'------------ END PACKET BYTES PROCESSING ------------')\n",
    "    \n",
    "    df_final.to_csv(f'./UNSW/Export/Packet-Bytes/Packet_Bytes_File_{i}.csv', index=False)\n",
    "    write_log(f'------------ CSV File {i} Saved for Packet Bytes ------------')\n",
    "    \n",
    "    del dec_data\n",
    "    del df_final\n",
    "    \n",
    "    write_log(f'------------ BEGIN PACKET FIELDS PROCESSING ------------')\n",
    "    df.drop(['stime', 'sttl', 'total_len', 'first_layer', 't_delta', 'stime_flow', 'label', 'ltime_max', 'payload_length'], axis=1, inplace=True)\n",
    "    # Move the label to the last column and packet_id to the first.\n",
    "    attack_label = df.pop('attack_cat')\n",
    "    df.insert(len(df.columns), 'attack_label', attack_label)\n",
    "    packet_id = df.pop('packet_id')\n",
    "    df.insert(0, 'packet_id', packet_id)\n",
    "    df.drop_duplicates(subset=df.columns.difference(['packet_id', 'flow_id']), inplace=True)\n",
    "    # Advance the counter past every id handed out above. Using the\n",
    "    # de-duplicated row count here would start the next file inside the range\n",
    "    # already assigned to this one and produce colliding packet_ids.\n",
    "    start_value = end_value\n",
    "    counts = '\\n'.join([f'{key}:{value}' for key, value in df.attack_label.value_counts().to_dict().items()])\n",
    "    # Context manager guarantees the summary file is closed even if a write fails.\n",
    "    with open(\"UNSW_INFO.txt\", \"a\") as info_file:\n",
    "        info_file.write(f'TOTAL PACKETS IN CSV FILE {i}: {df.shape}\\n')\n",
    "        info_file.write(f'ATTACK LABELS IN CSV FILE {i}\\n' + counts + '\\n\\n')\n",
    "    write_log(f'------------ END PACKET FIELDS PROCESSING ------------')\n",
    "    \n",
    "    df.to_csv(f'./UNSW/Export/Packet-Fields/Packet_Fields_File_{i}.csv', index=False)\n",
    "    write_log(f'------------ CSV File {i} Saved for Packet Fields ------------\\n')\n",
    "    \n",
    "    del df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4aee0e7d-6770-4859-a1e2-adf582bf270e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_2.csv (deflated 84%)\n",
      "  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_2.csv (deflated 90%)\n",
      "  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_2.csv (deflated 82%)\n",
      "  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_3.csv (deflated 85%)\n",
      "  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_3.csv (deflated 90%)\n",
      "  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_3.csv (deflated 82%)\n",
      "  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_4.csv (deflated 84%)\n",
      "  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_4.csv (deflated 89%)\n",
      "  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_4.csv (deflated 81%)\n",
      "  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_5.csv (deflated 84%)\n",
      "  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_5.csv (deflated 90%)\n",
      "  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_5.csv (deflated 81%)\n",
      "  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_6.csv (deflated 84%)\n",
      "  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_6.csv (deflated 90%)\n",
      "  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_6.csv (deflated 81%)\n",
      "  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_7.csv (deflated 85%)\n",
      "  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_7.csv (deflated 90%)\n",
      "  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_7.csv (deflated 82%)\n",
      "  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_8.csv (deflated 84%)\n",
      "  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_8.csv"
     ]
    }
   ],
   "source": [
    "# (export folder, file stem) for each dataset variant that gets archived.\n",
    "zip_targets = [\n",
    "    (\"Payload-Bytes\", \"Payload_Bytes_File\"),\n",
    "    (\"Packet-Bytes\", \"Packet_Bytes_File\"),\n",
    "    (\"Packet-Fields\", \"Packet_Fields_File\"),\n",
    "]\n",
    "\n",
    "for i in range(2,19):\n",
    "    for folder, stem in zip_targets:\n",
    "        # Archive name is the CSV path without its extension.\n",
    "        zip_file_name = f\"./UNSW/Export/{folder}/{stem}_{i}\"\n",
    "        file_name = f\"{zip_file_name}.csv\"\n",
    "        !zip {zip_file_name} {file_name}\n",
    "    \n",
    "    write_log(f\"FILE {i} ZIPPED SUCCESSFULLY\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4674e46-8901-4580-a721-8ee07e1de3ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (export folder, file stem) for each archived dataset variant.\n",
    "unzip_targets = [\n",
    "    (\"Payload-Bytes\", \"Payload_Bytes_File\"),\n",
    "    (\"Packet-Bytes\", \"Packet_Bytes_File\"),\n",
    "    (\"Packet-Fields\", \"Packet_Fields_File\"),\n",
    "]\n",
    "\n",
    "for i in range(2,19):\n",
    "    for folder, stem in unzip_targets:\n",
    "        # Every archive sits directly inside its export folder, next to the CSV\n",
    "        # it was created from. The Payload-Bytes path previously repeated the\n",
    "        # folder prefix and pointed at a location the zip step never writes to.\n",
    "        file_name = f\"./UNSW/Export/{folder}/{stem}_{i}.zip\"\n",
    "        !unzip {file_name}\n",
    "    \n",
    "    write_log(f\"FILE {i} UNZIPPED SUCCESSFULLY\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03201bbe-51d8-412c-81b5-baef663629be",
   "metadata": {},
   "source": [
    "# Upload Files to Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "baffd2c6-028f-4794-88e9-a76240f65ee7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import login, HfApi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f6f8d3e4-710e-4efb-ae7b-05e30d05e08d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5e3e21b00b5142ea80a56700fc80117f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Authenticate with the Hugging Face Hub; in a notebook this renders a token-entry widget.\n",
    "login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "bcc3fccd-f469-44c2-a7aa-c07cfc9c5fa5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hub API client used by the upload cell below.\n",
    "api = HfApi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "06031554-7f79-45cf-bea8-b04fad7bbb7b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/rajkumardheivanayahi/.local/lib/python3.9/site-packages/huggingface_hub/utils/_experimental.py:57: UserWarning: 'plan_multi_commits' is experimental and might be subject to breaking changes in the future. You can disable this warning by setting `HF_HUB_DISABLE_EXPERIMENTAL_WARNING=1` as environment variable.\n",
      "  warnings.warn(\n",
      "/data/rajkumardheivanayahi/.local/lib/python3.9/site-packages/huggingface_hub/utils/_experimental.py:57: UserWarning: 'HfApi.create_commits_on_pr' is experimental and might be subject to breaking changes in the future. You can disable this warning by setting `HF_HUB_DISABLE_EXPERIMENTAL_WARNING=1` as environment variable.\n",
      "  warnings.warn(\n",
      "Will create 0 deletion commit(s) and 18 addition commit(s), totalling 18 atomic operations.\n",
      "Multi-commits strategy with ID c0d23037c50ff24f17fe6205fe8d24953f66f9bb1a5fe08d9c3f554df126b183.\n",
      "New PR created: https://huggingface.co/datasets/rdpahalavan/UNSW-NB15/discussions/6\n",
      "  step e5d7d65f25be74ce89fff63a6b96b95782c5dbdf47e8e1716009d478c7b572ce completed (still 17 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2d53b005f0344f4d8c2f7b0b77f588a1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_10.parquet:   0%|          | 0.00/2.65G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 2ef2628c74ccc33972d258e1cac6b6c02cab17b2401fd61743e613580847d5a6 completed (still 16 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bc80cf39bad04b1abd3bf88cb0427ebe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_11.parquet:   0%|          | 0.00/2.65G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 84c65ef7a0d01daf9689d92167c7318f734c6230063c33511379028c40cd0d5b completed (still 15 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a3819d7f0126475a8f6d91740c251b98",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_12.parquet:   0%|          | 0.00/2.71G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 745c5b4c477e36bc4055be82d170b531c43d4e2634d46917a9936586833c708e completed (still 14 to go).\n",
      "  step 6b97d17426eb7cf353e0a6bfe9e24b94e74cbcb2c416a41bae3f2d27145acbbd completed (still 13 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0873f85c665543cabfe41e2dfe290924",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_14.parquet:   0%|          | 0.00/2.72G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 5d85060e987bade56ba3efc8cbfa7376c1a9665fe5979125ec1d8a62a21c37ef completed (still 12 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b6f14607feae4e4ab97b1faf5324012d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_15.parquet:   0%|          | 0.00/2.77G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step c4228cf15809d9ca0d0017481294defe69f379fec642044b7b17f1c7ffb60547 completed (still 11 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e4499e28747743a597c07c048727ce8b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_16.parquet:   0%|          | 0.00/2.73G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step aff20752c897767cbf949734c348a3e4f622cfec16cbee08129c9826481d885d completed (still 10 to go).\n",
      "  step db23a788bdb0f1a78e74c43073860dedf3cc7ba4d9953cb027ad05f04c6c7d7a completed (still 9 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4d513aebe9b84174b3f2575812a37ad3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_18.parquet:   0%|          | 0.00/2.60G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step ea971f8d3ba3a48b67a63354b6471bb644e76dac7055303715cf2053f3479b3c completed (still 8 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d93a624845bc43d38ddbabc89f82e594",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_2.parquet:   0%|          | 0.00/2.66G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 869d5af7aea9bc530e236623fef592d76329f2dd3341f312e1bc5dd05418aabd completed (still 7 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6329f1c293b64cd6abd95d3ec067c543",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_3.parquet:   0%|          | 0.00/2.62G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 89802d83893758ff601ee9bb4a71b5149aac0f835c952d6dbf360f7098cd3185 completed (still 6 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2597fa18df964223b9457176c9f609e0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_4.parquet:   0%|          | 0.00/2.69G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 2cbaec6226fc8e1e720fae55fb60994cfce8d4314c2ff104990b8d38d745d3d8 completed (still 5 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4a6ac93d189843beafb086fb95701c26",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_5.parquet:   0%|          | 0.00/2.66G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 57e026ef17fd2747ac6003ea775feba275357e16c1d5026ee9515cf426f92c3f completed (still 4 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b0d2b6d6a93948898b06e643fcbfe78e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_6.parquet:   0%|          | 0.00/2.59G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step adde2d0a4e91adf93eede7be3dd4f8a29b10753f4cd9d3b273fa43029553c955 completed (still 3 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3770430458cb4a5a8d7a06c4a99ccf50",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_7.parquet:   0%|          | 0.00/2.56G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 251846617191479bb706501f1e467ef30a880304d0fa503a7b2176eb2e3b260d completed (still 2 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b4e4421d89c74458974c19cae70f2539",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_8.parquet:   0%|          | 0.00/2.65G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step b9de4efa248aee15ae581c33e1ce9d02cbc06d5501588535d686be17d6c58c54 completed (still 1 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5591c74b0ed8429fb80a911ebaac17c1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Payload_Bytes_File_9.parquet:   0%|          | 0.00/2.57G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 7db7d90a32aae5af665f96af3682f59ba3c9281763c7d92402a12d9433e052ab completed (still 0 to go).\n",
      "All commits have been pushed.\n",
      "PR is now open for reviews.\n",
      "PR has been automatically merged (`merge_pr=True` was passed).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/datasets/rdpahalavan/UNSW-NB15/tree/main/Payload-Bytes/'"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Push the local Payload-Bytes Parquet exports to the dataset repository.\n",
    "upload_options = dict(\n",
    "    folder_path=\"./UNSW/Export/Payload-Bytes\",\n",
    "    repo_id=\"rdpahalavan/UNSW-NB15\",\n",
    "    repo_type=\"dataset\",\n",
    "    path_in_repo=\"Payload-Bytes/\",\n",
    "    # Restrict the upload to Parquet files (CSV/ZIP files in the folder are ignored).\n",
    "    allow_patterns=\"*.parquet\",\n",
    "    # NOTE(review): per the upload_folder docs this removes matching remote files\n",
    "    # as part of the commit -- confirm that stale remote Parquet files should go.\n",
    "    delete_patterns=\"*.parquet\",\n",
    "    # Experimental huggingface_hub feature (see the UserWarning in the cell output):\n",
    "    # the upload is split into several commits on a PR.\n",
    "    multi_commits=True,\n",
    "    multi_commits_verbose=True,\n",
    ")\n",
    "api.upload_folder(**upload_options)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df13314c-e5b9-496d-9c06-753a17681ca5",
   "metadata": {},
   "source": [
    "# CSV to Parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b2f5ab4-c940-413d-a592-3b3b96c28613",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (export folder, file stem, label used in the log line) for each dataset variant.\n",
    "parquet_targets = [\n",
    "    ('Payload-Bytes', 'Payload_Bytes_File', 'Payload Bytes'),\n",
    "    ('Packet-Bytes', 'Packet_Bytes_File', 'Packet Bytes'),\n",
    "    ('Packet-Fields', 'Packet_Fields_File', 'Packet Fields'),\n",
    "]\n",
    "\n",
    "for i in range(3,19):\n",
    "    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>\\n')\n",
    "    for folder, stem, label in parquet_targets:\n",
    "        base_path = f'./UNSW/Export/{folder}/{stem}_{i}'\n",
    "        # Parse the whole file in one pass, then switch to pandas' nullable dtypes\n",
    "        # before writing the Parquet copy next to the CSV.\n",
    "        df = pd.read_csv(f'{base_path}.csv', low_memory=False)\n",
    "        df = df.convert_dtypes()\n",
    "        df.to_parquet(f'{base_path}.parquet', index=False)\n",
    "        # Drop the frame before loading the next (large) file.\n",
    "        del df\n",
    "        write_log(f'------------ {label} Exported ------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "37bee2df-20d6-4971-ba38-46f9bd4c65a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Collect every column name that appears in any Payload-Bytes CSV header.\n",
    "all_columns = set()\n",
    "\n",
    "for i in range(1,19):\n",
    "    # nrows=0 reads the header only, so no packet rows are loaded.\n",
    "    df = pd.read_csv(f'./UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}.csv', nrows=0)\n",
    "    all_columns |= set(df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d9e6b6b9-ce0f-4d5d-abab-b3195b4b67b4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1485"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct column names across all Payload-Bytes CSV headers.\n",
    "len(all_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "44824e4f-7793-42f4-b2c2-8995cd117290",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed: 1\n",
      "Completed: 2\n",
      "Completed: 3\n",
      "Completed: 4\n",
      "Completed: 5\n",
      "Completed: 6\n",
      "Completed: 7\n",
      "Completed: 8\n",
      "Completed: 9\n",
      "Completed: 10\n",
      "Completed: 11\n",
      "Completed: 12\n",
      "Completed: 13\n",
      "Completed: 14\n",
      "Completed: 15\n",
      "Completed: 16\n",
      "Completed: 17\n",
      "Completed: 18\n"
     ]
    }
   ],
   "source": [
    "# Give every Payload-Bytes Parquet file the same set of columns by appending\n",
    "# all-missing columns for any name in `all_columns` that a file lacks.\n",
    "for i in range(1,19):\n",
    "    parquet_path = f'./UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}.parquet'\n",
    "    df = pd.read_parquet(parquet_path)\n",
    "    # Sort by (length, name) so payload_byte_9 comes before payload_byte_10 and the\n",
    "    # appended columns land in the same order on every run; iterating the raw set\n",
    "    # would follow string hashing, which changes between interpreter sessions.\n",
    "    missing_cols = sorted(all_columns - set(df.columns), key=lambda col: (len(col), col))\n",
    "    if missing_cols:\n",
    "        df_missing_cols = pd.DataFrame({col: np.nan for col in missing_cols}, index=df.index)\n",
    "        df_missing_cols = df_missing_cols.convert_dtypes()\n",
    "        df = pd.concat([df, df_missing_cols], axis=1)\n",
    "        # The file is rewritten in place only when something was added.\n",
    "        df.to_parquet(parquet_path, index=False)\n",
    "        del df_missing_cols\n",
    "    del df\n",
    "    print(f\"Completed: {i}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "89b04978-b852-4d93-a999-b6f9fd47e0f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the first Payload-Bytes Parquet file back into memory.\n",
    "df = pd.read_parquet('UNSW/Export/Payload-Bytes/Payload_Bytes_File_1.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "80a0c665-f70a-4a9f-a073-92bae35feb94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>packet_id</th>\n",
       "      <th>flow_id</th>\n",
       "      <th>source_ip</th>\n",
       "      <th>source_port</th>\n",
       "      <th>destination_ip</th>\n",
       "      <th>destination_port</th>\n",
       "      <th>protocol</th>\n",
       "      <th>payload_length</th>\n",
       "      <th>payload_byte_1</th>\n",
       "      <th>payload_byte_2</th>\n",
       "      <th>...</th>\n",
       "      <th>payload_byte_1468</th>\n",
       "      <th>payload_byte_1469</th>\n",
       "      <th>payload_byte_1470</th>\n",
       "      <th>payload_byte_1471</th>\n",
       "      <th>payload_byte_1472</th>\n",
       "      <th>payload_byte_1473</th>\n",
       "      <th>payload_byte_1474</th>\n",
       "      <th>payload_byte_1475</th>\n",
       "      <th>payload_byte_1476</th>\n",
       "      <th>attack_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>117</td>\n",
       "      <td>10.40.182.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>44</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>118</td>\n",
       "      <td>10.40.85.1</td>\n",
       "      <td>0</td>\n",
       "      <td>224.0.0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>ospf</td>\n",
       "      <td>44</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>33</td>\n",
       "      <td>6</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>111</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>32119</td>\n",
       "      <td>udp</td>\n",
       "      <td>48</td>\n",
       "      <td>54</td>\n",
       "      <td>239</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>36</td>\n",
       "      <td>2</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>udp</td>\n",
       "      <td>72</td>\n",
       "      <td>87</td>\n",
       "      <td>198</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>37</td>\n",
       "      <td>2</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>1024</td>\n",
       "      <td>59.166.0.0</td>\n",
       "      <td>33661</td>\n",
       "      <td>udp</td>\n",
       "      <td>24</td>\n",
       "      <td>33</td>\n",
       "      <td>153</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2503011</th>\n",
       "      <td>9999929</td>\n",
       "      <td>115904</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>51456</td>\n",
       "      <td>149.171.126.3</td>\n",
       "      <td>6881</td>\n",
       "      <td>tcp</td>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2503012</th>\n",
       "      <td>9999932</td>\n",
       "      <td>115900</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>31666</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>48</td>\n",
       "      <td>38</td>\n",
       "      <td>26</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2503013</th>\n",
       "      <td>9999946</td>\n",
       "      <td>115900</td>\n",
       "      <td>59.166.0.3</td>\n",
       "      <td>31666</td>\n",
       "      <td>149.171.126.5</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>48</td>\n",
       "      <td>178</td>\n",
       "      <td>44</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2503014</th>\n",
       "      <td>9999987</td>\n",
       "      <td>115922</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>616</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2503015</th>\n",
       "      <td>9999988</td>\n",
       "      <td>115922</td>\n",
       "      <td>59.166.0.8</td>\n",
       "      <td>4793</td>\n",
       "      <td>149.171.126.9</td>\n",
       "      <td>22</td>\n",
       "      <td>tcp</td>\n",
       "      <td>28</td>\n",
       "      <td>83</td>\n",
       "      <td>83</td>\n",
       "      <td>...</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2503016 rows × 1485 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         packet_id  flow_id      source_ip  source_port destination_ip  \\\n",
       "0                1      117    10.40.182.1            0      224.0.0.5   \n",
       "1                2      118     10.40.85.1            0      224.0.0.5   \n",
       "2               33        6  149.171.126.9          111     59.166.0.0   \n",
       "3               36        2  149.171.126.9         1024     59.166.0.0   \n",
       "4               37        2  149.171.126.9         1024     59.166.0.0   \n",
       "...            ...      ...            ...          ...            ...   \n",
       "2503011    9999929   115904     59.166.0.8        51456  149.171.126.3   \n",
       "2503012    9999932   115900     59.166.0.3        31666  149.171.126.5   \n",
       "2503013    9999946   115900     59.166.0.3        31666  149.171.126.5   \n",
       "2503014    9999987   115922     59.166.0.8         4793  149.171.126.9   \n",
       "2503015    9999988   115922     59.166.0.8         4793  149.171.126.9   \n",
       "\n",
       "         destination_port protocol  payload_length  payload_byte_1  \\\n",
       "0                       0     ospf              44               2   \n",
       "1                       0     ospf              44               2   \n",
       "2                   32119      udp              48              54   \n",
       "3                   33661      udp              72              87   \n",
       "4                   33661      udp              24              33   \n",
       "...                   ...      ...             ...             ...   \n",
       "2503011              6881      tcp              17               0   \n",
       "2503012                22      tcp              48              38   \n",
       "2503013                22      tcp              48             178   \n",
       "2503014                22      tcp             616               0   \n",
       "2503015                22      tcp              28              83   \n",
       "\n",
       "         payload_byte_2  ...  payload_byte_1468  payload_byte_1469  \\\n",
       "0                     1  ...               <NA>               <NA>   \n",
       "1                     1  ...               <NA>               <NA>   \n",
       "2                   239  ...               <NA>               <NA>   \n",
       "3                   198  ...               <NA>               <NA>   \n",
       "4                   153  ...               <NA>               <NA>   \n",
       "...                 ...  ...                ...                ...   \n",
       "2503011               0  ...               <NA>               <NA>   \n",
       "2503012              26  ...               <NA>               <NA>   \n",
       "2503013              44  ...               <NA>               <NA>   \n",
       "2503014               0  ...               <NA>               <NA>   \n",
       "2503015              83  ...               <NA>               <NA>   \n",
       "\n",
       "         payload_byte_1470  payload_byte_1471  payload_byte_1472  \\\n",
       "0                     <NA>               <NA>               <NA>   \n",
       "1                     <NA>               <NA>               <NA>   \n",
       "2                     <NA>               <NA>               <NA>   \n",
       "3                     <NA>               <NA>               <NA>   \n",
       "4                     <NA>               <NA>               <NA>   \n",
       "...                    ...                ...                ...   \n",
       "2503011               <NA>               <NA>               <NA>   \n",
       "2503012               <NA>               <NA>               <NA>   \n",
       "2503013               <NA>               <NA>               <NA>   \n",
       "2503014               <NA>               <NA>               <NA>   \n",
       "2503015               <NA>               <NA>               <NA>   \n",
       "\n",
       "         payload_byte_1473  payload_byte_1474  payload_byte_1475  \\\n",
       "0                     <NA>               <NA>               <NA>   \n",
       "1                     <NA>               <NA>               <NA>   \n",
       "2                     <NA>               <NA>               <NA>   \n",
       "3                     <NA>               <NA>               <NA>   \n",
       "4                     <NA>               <NA>               <NA>   \n",
       "...                    ...                ...                ...   \n",
       "2503011               <NA>               <NA>               <NA>   \n",
       "2503012               <NA>               <NA>               <NA>   \n",
       "2503013               <NA>               <NA>               <NA>   \n",
       "2503014               <NA>               <NA>               <NA>   \n",
       "2503015               <NA>               <NA>               <NA>   \n",
       "\n",
       "         payload_byte_1476  attack_label  \n",
       "0                     <NA>        normal  \n",
       "1                     <NA>        normal  \n",
       "2                     <NA>        normal  \n",
       "3                     <NA>        normal  \n",
       "4                     <NA>        normal  \n",
       "...                    ...           ...  \n",
       "2503011               <NA>        normal  \n",
       "2503012               <NA>        normal  \n",
       "2503013               <NA>        normal  \n",
       "2503014               <NA>        normal  \n",
       "2503015               <NA>        normal  \n",
       "\n",
       "[2503016 rows x 1485 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "752c3b98-6b83-43a0-967a-7ec672da854e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1485"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5fb6e18e-aa0f-4932-8e8d-c79ac93a91ee",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed: Packet_Fields_File_1.parquet\n",
      "Completed: Packet_Fields_File_2.parquet\n",
      "Completed: Packet_Fields_File_3.parquet\n",
      "Completed: Packet_Fields_File_4.parquet\n",
      "Completed: Packet_Fields_File_5.parquet\n",
      "Completed: Packet_Fields_File_6.parquet\n",
      "Completed: Packet_Fields_File_7.parquet\n",
      "Completed: Packet_Fields_File_8.parquet\n",
      "Completed: Packet_Fields_File_9.parquet\n",
      "Completed: Packet_Fields_File_10.parquet\n",
      "Completed: Packet_Fields_File_11.parquet\n",
      "Completed: Packet_Fields_File_12.parquet\n",
      "Completed: Packet_Fields_File_13.parquet\n",
      "Completed: Packet_Fields_File_14.parquet\n",
      "Completed: Packet_Fields_File_15.parquet\n",
      "Completed: Packet_Fields_File_16.parquet\n",
      "Completed: Packet_Fields_File_17.parquet\n",
      "Completed: Packet_Fields_File_18.parquet\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import pyarrow.parquet as pq\n",
    "from collections import defaultdict\n",
    "\n",
    "# Directory that holds the exported per-file packet-field tables.\n",
    "PACKET_FIELDS_DIR = \"./UNSW/Export/Packet-Fields/\"\n",
    "files = [f\"Packet_Fields_File_{i}.parquet\" for i in range(1,19)]\n",
    "\n",
    "# column name -> list with the dtype observed for that column in each file\n",
    "column_types = defaultdict(list)\n",
    "\n",
    "for file in files:\n",
    "    # Only the dtypes are needed, so read the Parquet schema and build a\n",
    "    # zero-row frame instead of loading every row of every file.\n",
    "    # NOTE(review): assumes the files carry pandas metadata (i.e. were written\n",
    "    # with DataFrame.to_parquet) so the zero-row frame reports the same dtypes\n",
    "    # as a full pd.read_parquet would - confirm on one file before relying on it.\n",
    "    schema = pq.read_schema(PACKET_FIELDS_DIR + file)\n",
    "    dtypes = schema.empty_table().to_pandas().dtypes\n",
    "    for column, dtype in dtypes.items():\n",
    "        column_types[column].append(dtype)\n",
    "    print(f\"Completed: {file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "951354a6-5a07-4007-982e-d1faaa90d96c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# For each column keep the dtype reported by the largest number of files;\n",
    "# most_common(1) yields a single (dtype, count) pair, of which only the dtype is kept.\n",
    "majority_column_types = {\n",
    "    name: Counter(seen_dtypes).most_common(1)[0][0]\n",
    "    for name, seen_dtypes in column_types.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "49ef25ef-f2d2-43a3-a0ca-44a7f2c7fc3a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed: 1\n",
      "Completed: 2\n",
      "Completed: 3\n",
      "Completed: 4\n",
      "Completed: 5\n",
      "Completed: 6\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "IOStream.flush timed out\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed: 7\n",
      "Completed: 8\n",
      "Completed: 9\n",
      "Completed: 10\n",
      "Completed: 11\n",
      "Completed: 12\n",
      "Completed: 13\n",
      "Completed: 14\n",
      "Completed: 15\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "IOStream.flush timed out\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed: 16\n",
      "Completed: 17\n",
      "Completed: 18\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Re-cast each exported file so every column carries its majority dtype,\n",
    "# then write the result back over the original file.\n",
    "for i in range(1,19):\n",
    "    path = f'./UNSW/Export/Packet-Fields/Packet_Fields_File_{i}.parquet'\n",
    "    df = pd.read_parquet(path)\n",
    "    # Cast column by column so only the mismatching columns are converted.\n",
    "    for column, dtype in df.dtypes.items():\n",
    "        majority_type = majority_column_types[column]\n",
    "        if dtype != majority_type:\n",
    "            df[column] = df[column].astype(majority_type)\n",
    "    # Write to a sibling temp file and swap it in with os.replace, so an\n",
    "    # interrupted or failed write never leaves the source file truncated.\n",
    "    tmp_path = path + '.tmp'\n",
    "    try:\n",
    "        df.to_parquet(tmp_path, index=False)\n",
    "        os.replace(tmp_path, path)\n",
    "    finally:\n",
    "        # After a successful replace the temp file is gone; this only removes\n",
    "        # a partial temp file left behind by a failed write.\n",
    "        if os.path.exists(tmp_path):\n",
    "            os.remove(tmp_path)\n",
    "    del df\n",
    "    print(f\"Completed: {i}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
